Using multiprocessing to download images based on image URLs

This post shows how to fetch images to the local disk once a large set of image URLs has been collected. Because a single-machine, single-threaded approach is very inefficient, I wrote a multi-process, multi-threaded crawler that downloads images many times faster than a single thread. (I later found that the Scrapy framework can also download at very high speed.) The amount of test data I needed during my internship at Microsoft was huge, and crawling at that scale would certainly get blocked and banned by the target site; fortunately, there is an internal cluster of several thousand machines that can share the workload through a single API call. Because it is exposed as an API, it can be plugged into this code as well; only the downloading_images function needs to be modified. Readers in a similar situation can use this as a reference.
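
As a rough illustration of that last point, here is a minimal sketch of what a drop-in replacement for the downloading_images function (defined in the script below) might look like if the actual download were handed off to such a cluster through an HTTP API. The endpoint address and the JSON payload fields are hypothetical placeholders, not a real interface.

import json
import urllib.request

# Hypothetical endpoint of an internal download cluster; not a real URL.
CLUSTER_API = "http://download-cluster.internal/submit"

def downloading_images(url, filename):
    # Instead of fetching the image here, submit the job to the cluster
    # and let it store the file under the requested name.
    payload = json.dumps({"url": url, "target": filename}).encode("utf-8")
    request = urllib.request.Request(
        CLUSTER_API,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        print("submitted", url, "status", response.status)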

#coding=utf-8
from time import sleep, ctime
import threading
import urllib.request
import os

print("program start %s" % ctime())

# Create the output directory for the downloaded images if it does not exist.
path = "huaban_tem_images"
if not os.path.exists(path):
    os.makedirs(path)

# Read all image URLs and split them into batches of 10; each batch is
# downloaded by one round of threads below.
url_set = []
filename_set = []
temp_url_set = []
temp_filename_set = []
with open("images_urls.txt", 'r', encoding="utf8") as read_file:
    all_lines = read_file.readlines()
    print("number: ", len(all_lines))
    len_all_lines = len(all_lines)
    for index, line in enumerate(all_lines):
        url = line.strip()
        if not url:  # skip blank lines
            continue
        temp_url_set.append(url)
        # Name the local file after the last path segment of the URL.
        file_name = path + "/" + str(url.split("/")[-1]) + ".png"
        temp_filename_set.append(file_name)
        if (index + 1) % 10 == 0 or index == (len_all_lines - 1):
            url_set.append(temp_url_set)
            filename_set.append(temp_filename_set)
            temp_url_set = []
            temp_filename_set = []



def downloading_images(url, filename):
    # Fetch a single image and save it to disk; to delegate the work to an
    # external download service, only this function needs to change.
    urllib.request.urlretrieve(url, filename=filename)
    sleep(1)
    print("end downloading", url)



if __name__ == '__main__':
    epoch = 1
    # Process one batch of 10 URLs per epoch: start one thread per URL,
    # then wait for the whole batch to finish before moving on.
    for (urls, filenames) in zip(url_set, filename_set):
        threads = []
        for (url, filename) in zip(urls, filenames):
            t = threading.Thread(target=downloading_images, args=(url, filename))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print("epoch %d finished in %s" % (epoch, ctime()))
        epoch += 1

print('program end: %s' % ctime())
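
Since the script above only uses threads, here is a minimal sketch of the multi-process variant mentioned in the introduction, assuming the same images_urls.txt file and output directory; the pool size of 10 mirrors the batch size above and is only an illustrative choice. For I/O-bound downloads a thread pool works just as well.

import multiprocessing
import os
import urllib.request

PATH = "huaban_tem_images"

def download_one(url):
    # Download a single image; return (url, error) so the parent process
    # can report failures instead of crashing a worker.
    filename = os.path.join(PATH, url.split("/")[-1] + ".png")
    try:
        urllib.request.urlretrieve(url, filename)
        return url, None
    except Exception as exc:
        return url, str(exc)

if __name__ == '__main__':
    os.makedirs(PATH, exist_ok=True)
    with open("images_urls.txt", 'r', encoding="utf8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # Keep 10 downloads in flight at all times instead of waiting for the
    # slowest URL in each fixed batch of 10.
    with multiprocessing.Pool(processes=10) as pool:
        for url, error in pool.imap_unordered(download_one, urls):
            print("failed" if error else "end downloading", url, error or "")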