
Yesterday my sister-in-law mentioned she missed two comics she used to read and asked me to find them online. After a bit of thought I decided gevent would be a good fit for scraping them, and spent a few hours putting the code together. The annoying part: the comic site builds its image URLs in JavaScript, and the script is minified, which made it a pain to analyze.

The code consists of two files in total:

getImgs.py

#-*- encoding:utf-8 -*-
import gevent
from gevent import monkey
from gevent.pool import Pool
import os, urllib2

monkey.patch_all()

class ComicDown():
    process = 5

    def __init__(self, poolsize=5):
        self.urls = []
        self.pool = Pool(poolsize)  # maximum number of concurrent downloads

    def addurl(self, url, headers, savefile):
        self.pool.spawn(self.down, url, headers, savefile)  # queue a download task in the pool

    def down(self, url, headers, savefile):
        try:
            request = urllib2.Request(url)
            for h in headers:
                request.add_header(h[0], h[1])

            opener = urllib2.build_opener()
            img = opener.open(request).read()
            open(savefile, 'wb').write(img)  # write the downloaded image to disk
            print url, 'Complete!'
        except Exception, e:
            print url, 'Error'
            print e

    def join(self):
        self.pool.join()

cd = ComicDown()

def start(path):
    f = open(os.path.join(path, 'link'))
    line = f.readline()
    while line:
        urlinfo = {}
        urlinfo['url'] = line.split('\t')[0]
        urlinfo['headers'] = [['referer', line.split('\t')[1].strip()]]
        urlinfo['savefile'] = os.path.join(path, urlinfo['url'].split('/')[-1])
        line = f.readline()
        if not os.path.exists(urlinfo['savefile']):
            cd.addurl(**urlinfo)

getImgs.py downloads the images through a gevent Pool used as a task pool.
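For reference, each line of a link file describes one image: the image URL and the referer page to send, separated by a tab (exactly the format getComic.py writes below). A minimal sketch of driving getImgs.py on its own, assuming such a file already sits in a hypothetical ./comic/SomeTitle/01 directory (the digits in the sample line are placeholders):

# one line of the 'link' file (URL and referer separated by a tab):
# http://t5.imanhua.com/Files/Images/1234/56789/imanhua_001.jpg<TAB>http://www.imanhua.com/comic/1234/list_56789.html

import getImgs

getImgs.start('./comic/SomeTitle/01')  # queue every image listed in ./comic/SomeTitle/01/link
getImgs.cd.join()                      # block until the pool has finished all downloads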

getComic.py

#coding:utf-8
from pyquery import PyQuery as pq
from optparse import OptionParser
import os, re, getImgs

def getparser():
    parser = OptionParser()
    parser.add_option('-u', '--url', dest='url', help=u'URL of the comic to crawl', metavar='url')
    options, args = parser.parse_args()
    return options, args

def getImgList(url):
    '''Build the list of image URLs (and their referer pages) for one chapter'''
    content = pq(url=url).html()
    jpgs = re.findall(r"imanhua_\d{3}", content)
    out = []
    d1, d2 = re.findall(r'(\d{4}).*(\d{5})', url)[0]

    for j in jpgs:
        out.append('http://t5.imanhua.com/Files/Images/%s/%s/%s.jpg\thttp://www.imanhua.com/comic/%s/list_%s.html' % \
                   (d1, d2, j, d1, d2))
    return out

d = pq(url=getparser()[0].url)
site = '/'.join(getparser()[0].url.split('/')[:3])
print 'site', site
title = d('div.bookInfo h1').text()
if not os.path.exists('./comic/%s' % title):
    os.mkdir('./comic/%s' % title)  # create a directory for this comic

print 'title', title
for i in d('ul#subBookList li a'):
    tmp = i.values()
    if not os.path.exists('./comic/%s/%s' % (title, tmp[1])):
        os.mkdir('./comic/%s/%s' % (title, tmp[1]))
    if not os.path.exists('./comic/%s/%s/link' % (title, tmp[1])):
        jpglist = getImgList(site + tmp[0])
        f = open('./comic/%s/%s/link' % (title, tmp[1]), 'wb')
        for j in jpglist:
            f.write(j + '\r\n')
        f.close()
    #print ('getImgs.py -p %s/comic/%s/%s' % (os.getcwd().decode('cp936'), title, tmp[1])).encode('cp936')
    #os.system(('getImgs.py -p %s/comic/%s/%s' % (os.getcwd().decode('cp936'), title, tmp[1])).encode('cp936'))
    try:
        getImgs.start(('%s/comic/%s/%s' % (os.getcwd().decode('cp936'), title, tmp[1])).encode('cp936'))
    except:
        print u'%s Error' % tmp[1]

getImgs.cd.join()

getComic.py uses pyquery and the re module to parse the pages, extracting the comic's metadata and the download URL of every image.
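A typical invocation, assuming a ./comic directory already exists next to the scripts (the comic id in the URL is only a placeholder):

python getComic.py -u http://www.imanhua.com/comic/1234/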

tip

This code was written on Windows, so filesystem paths in the code are encoded with cp936.
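If you want to run the scripts on another platform, one option (not part of the original code, just a sketch) is to ask Python 2 for the platform's filesystem encoding instead of hard-coding cp936:

import sys, os

def chapter_path(title, chapter):
    # build the on-disk path with the platform's filesystem encoding
    # ('mbcs' on Chinese Windows, usually utf-8 on Linux/macOS)
    fs_enc = sys.getfilesystemencoding() or 'utf-8'
    return ('%s/comic/%s/%s' % (os.getcwd().decode(fs_enc), title, chapter)).encode(fs_enc)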
