Yet another "old driver" image crawler

Last night I was browsing around and somehow ended up on a certain website....
So I threw together a crawler on the spot. The code is a bit rough, but I'm posting it anyway so we can pick it apart together.
Originally I planned to sort the images into one folder per gallery, but each folder only ended up with a handful of pictures, which wasn't much fun to browse, so I dropped that idea (a quick sketch of it is below).
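
For reference, the per-folder version would only have needed one extra directory call per gallery. Here is a minimal sketch of that idea; the helper name save_into_folder is made up for illustration and is not part of the script further down:

import os
import urllib.request

def save_into_folder(title, img_urls):
    # One directory per gallery, images numbered inside it
    os.makedirs(title, exist_ok=True)
    for i, img in enumerate(img_urls, start=1):
        data = urllib.request.urlopen(img).read()
        with open(os.path.join(title, '%d.jpg' % i), 'wb') as fp:
            fp.write(data)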

First, a screenshot of the program running, to show that it actually works:

#coding:utf-8
'''
Created on 2018-04-21
Sample image URL: http://k1.lsxwk.com/1/20160403/e50ebb725cb31939210.jpg
@author: iceH
'''
import urllib.request
import re
import time
import multiprocessing

def open_url(url):
    # Fetch a URL with a browser-like User-Agent and return the raw bytes
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0')
    response = urllib.request.urlopen(req)
    return response.read()

def second_page(page_url):
    # Collect the gallery links from one list page
    try:
        links = []
        p = r'<a href="(/zipaitoupai/1[^"]+\.html)'
        html = open_url(page_url).decode('utf-8')
        for each in re.findall(p, html):
            links.append('http://aiyaolulu.org' + each)
        return links

    except Exception:
        print('Something went wrong while reading the list page')
        time.sleep(1)
        return []

def find_img(img_url):
    # Extract the image URLs from a gallery page
    try:
        html = open_url(img_url).decode('utf-8')
        p = r'<img src="(http[^"]+)'
        return re.findall(p, html)

    except Exception:
        print('Something went wrong while reading the gallery page')
        time.sleep(1)
        return []

def get_title(url):
    # Grab the gallery title from the <title> tag (the site-name suffix is stripped)
    try:
        html = open_url(url).decode('utf-8')
        p = r'<title>(.+?)_爱要撸'
        title = ''.join(re.findall(p, html))
        return title

    except Exception:
        print('Something went wrong while reading the title')
        time.sleep(1)
        return ''

def down_img(title_dir, second_url):
    # Download every image of one gallery into the current directory
    f = 1
    img_list = find_img(second_url)
    try:
        for img in img_list:
            print('[*] Downloading image %d of [%s]' % (f, title_dir))
            filename = title_dir + str(f) + '.jpg'
            se_img = urllib.request.urlopen(img).read()
            with open(filename, 'wb') as m:
                m.write(se_img)
            f += 1
    except Exception:
        print('[*] ε=ε=ε=ε=┌(; ̄◇ ̄)┘ waiting 10 seconds before continuing =====')
        time.sleep(10)

def main(page):
    # Process one list page: find its galleries and download each of them
    url = 'http://aiyaolulu.org/zipaitoupai/list_%d.html' % page
    for gallery_url in second_page(url):
        title_dir = get_title(gallery_url)
        down_img(title_dir, gallery_url)


if __name__ == '__main__':
    multiprocessing.freeze_support()  # prevents errors when packaged into an exe
    text = r'''
 ____               _               _    _
|  _ \          _  (_)             | |  | |
| |_) | _   _  (_)  _    ___   ___ | |__| |
|  _ < | | | |     | |  / __| / _ \|  __  |
| |_) || |_| |  _  | | | (__ |  __/| |  | |
|____/  \__, | (_) |_|  \___| \___||_|  |_|
         __/ |
        |___/

Data source: http://aiyaolulu.org
'''
    print('*' * 50)
    print(text + '\033[0m\n')
    print('*' * 50)
    pool = multiprocessing.Pool()
    pool.map(main, range(1, 50))

Since some of you don't have a Python environment installed, I've packaged the program into an exe (a word of advice: don't run it straight from your desktop).

Download: aiyaolulu.exe
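
If you'd rather build the exe yourself, a packer such as PyInstaller should do it (that's an assumption on my part; the post doesn't say which tool was used). The multiprocessing.freeze_support() call at the top of the __main__ block exists precisely for this frozen-exe case. Assuming the script is saved as crawler.py (hypothetical name):

pip install pyinstaller
pyinstaller -F crawler.py    # -F bundles everything into a single exe under dist/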

------------- End of post. Thanks for reading -------------