文章目录

想找动画片看,但那个肉丸网盘下载站的完结动漫没有按点击量排序,所以就把列表采集下来自己排。代码里用了gevent和urllib2,留着给自己以后参考。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#-*- encoding:utf-8 -*-
# 作者:chronos

# Listing pages 1-8 of the site's search for "completed" series.
# Each URL is built as unicode text and encoded to GBK bytes exactly once,
# rather than decoding a byte literal from UTF-8 and re-encoding it; the
# resulting bytes are the same, but this no longer relies on how the source
# file happens to be saved.
urls = [
    (u'http://d.52rwdm.com/search.asp?page=%d&searchword=完结&searchtype=2' % i).encode('gbk')
    for i in range(1, 9)
]

import gevent
from gevent import monkey
import re

monkey.patch_all()

import urllib2

class Comic():
    """Download the listing pages of completed series and rank them by clicks.

    Constructing an instance performs all downloads: ``process`` greenlets
    drain the URL queue concurrently and every ``(title, clicks)`` pair found
    is appended to ``rslist``.  Call ``sortlist()`` afterwards to order the
    result from most to least clicked.
    """

    process = 5  # number of greenlets downloading at the same time

    # One listing entry: title in group 1, click count in group 2.
    _ENTRY_RE = re.compile(
        u'<a href="/view/\\d+\\.html"><b>(.*?)</b>\\[完结\\]'
        u'<span>\\[.*?\\]</span><span>\\((\\d+)点击\\)</span>'
    )

    def __init__(self, urls):
        """Fetch every page in *urls* and collect the parsed entries.

        urls -- iterable of page URLs.  It is copied, so the caller's list
                is left untouched.
        """
        # Reversed copy so that pop() from the end hands URLs out in the
        # order the caller supplied them.
        self.urls = list(urls)
        self.urls.reverse()
        self.rslist = []  # collected (title, clicks) tuples
        # Start the workers first and wait for all of them together;
        # joining each greenlet right after spawning it would serialize
        # the downloads.
        workers = [gevent.spawn(self.down) for _ in range(self.process)]
        gevent.joinall(workers)

    def down(self):
        """Worker loop: download queued pages until ``self.urls`` is empty.

        A page that fails is reported and skipped so the remaining pages
        are still processed.
        """
        while self.urls:
            url = self.urls.pop()
            try:
                self.getPage(url)
            except Exception as err:
                # Worker boundary: report the failure instead of hiding it.
                print('%s failed: %r' % (url, err))

    def buildList(self, data):
        """Append every ``(title, clicks)`` pair found in *data* to ``rslist``.

        data -- raw page bytes; decoded as GBK with undecodable bytes dropped.
        """
        html = data.decode('gbk', 'ignore')
        self.rslist.extend(self._ENTRY_RE.findall(html))

    def getPage(self, url):
        """Download the page at *url* and parse its entries into ``rslist``.

        Exceptions raised by the download propagate to the caller.
        """
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        print('%s %s' % (url, u'下载完成'.encode('gbk')))
        self.buildList(data)

    def cmp(self, a, b):
        """Compare two entries by click count; return 1, 0 or -1."""
        left, right = int(a[1]), int(b[1])
        return (left > right) - (left < right)

    def sortlist(self):
        """Sort ``rslist`` in place, highest click count first.

        Entries with equal click counts keep their relative order.
        """
        self.rslist.sort(key=lambda entry: int(entry[1]), reverse=True)

# Download every listing page, then rank the collected entries by clicks.
c = Comic(urls)
c.sortlist()

# Binary mode: each line is already GBK-encoded bytes ending in '\r\n', so
# the platform must not apply its own newline translation on top of that.
# The with-block closes the file even if writing fails part-way.
with open('CompleteComic.txt', 'wb') as f:
    for comic in c.rslist:
        try:
            line = (u'%s, %s\r\n' % (comic[1], comic[0])).encode('gbk')
        except UnicodeEncodeError:
            # The title cannot be represented in GBK: show it and move on.
            print(comic[0])
            continue
        f.write(line)
文章目录