scrapy downloading images

之前介绍了自己编写的多进程、多线程实现,后来发现 Scrapy 这个框架更好:它拥有多线程的速度,并且会提示你有哪些图片没有爬取下来、完成了多少张图片的爬取之类的信息,而且使用起来非常方便,可以直接通过 pip 安装。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
# import codecs
import os
from bingproxy import BingProxy

class ImagesSpider(scrapy.Spider):
    """Spider that downloads every image listed in a local URL file.

    Each line of the input file is a direct image URL; the response body is
    written to ``dir_path`` using the last path segment of the URL (query
    string stripped) plus a ``.png`` suffix as the filename.
    """

    name = "images"
    # Output directory for downloaded images; created eagerly at class
    # definition time so parse() can write into it unconditionally.
    dir_path = "huaban_bingproxy_big_images"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    start_urls = []
    # NOTE(review): proxy helper is constructed but currently unused by
    # start_requests (the proxied-request line was disabled) — confirm
    # whether it can be removed.
    bingProxy = BingProxy()

    def start_requests(self):
        """Yield one GET request per non-blank line of the URL list file."""
        with open('processing_threading_huaban_big_images_all_urls_part3.txt') as url_list:
            for line in url_list:
                url = line.strip()
                # str.strip() always returns a str, so a simple truthiness
                # check covers empty and whitespace-only lines.
                if url:
                    yield scrapy.Request(url=url, callback=self.parse, method="GET")

    def parse(self, response):
        """Save the response body to ``<dir_path>/<url basename>.png``."""
        # Strip any query string so the filename is valid on all
        # filesystems, and reuse dir_path instead of re-hardcoding the
        # directory name.
        filename = response.url.split('/')[-1].split('?')[0] + ".png"
        path = os.path.join(self.dir_path, filename)
        with open(path, 'wb') as f:
            f.write(response.body)