之前介绍了自己编写的多进程、多线程爬虫实现，后来发现 Scrapy 这个框架更好用：它的速度可以媲美多线程，还会提示哪些图片没有爬取下来、已经完成了多少张图片的爬取等信息，而且使用起来非常方便，可以直接通过 pip 安装。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
|
import scrapy
import os from bingproxy import BingProxy
class ImagesSpider(scrapy.Spider):
    """Download every image URL listed in a local text file.

    URLs are read one per line from
    ``processing_threading_huaban_big_images_all_urls_part3.txt`` (relative
    to the working directory). Each response body is written to
    ``dir_path`` as ``<last URL path segment>.png``.
    """

    name = "images"

    # Output directory for downloaded images. It is created while the class
    # body executes (i.e. when this module is imported) if it does not exist.
    dir_path = "huaban_bingproxy_big_images"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Left empty on purpose: requests are produced by start_requests().
    start_urls = []
    # NOTE(review): not referenced anywhere in this class; presumably consumed
    # by proxy-related code elsewhere -- confirm before removing.
    bingProxy = BingProxy()

    def start_requests(self):
        """Yield one GET request per non-blank line of the URL list file."""
        with open('processing_threading_huaban_big_images_all_urls_part3.txt') as url_list:
            for line in url_list:
                url = line.strip()
                # str.strip() never returns None, so a truthiness check is
                # enough to skip blank lines.
                if url:
                    yield scrapy.Request(url=url, callback=self.parse, method="get")

    def parse(self, response):
        """Write the response body to ``dir_path`` as a ``.png`` file.

        The file name is the last path segment of ``response.url``. A
        trailing slash is ignored so such URLs do not all collapse onto
        the same ``.png`` file.
        """
        filename = response.url.rstrip('/').split('/')[-1] + ".png"
        # Use the class attribute rather than repeating the directory name,
        # so the directory created above and the one written to always agree.
        path = os.path.join(self.dir_path, filename)
        with open(path, 'wb') as f:
            f.write(response.body)
|