fetch huaban big image urls

This post uses the Selenium framework to crawl image URLs from the huaban site, so that the images can be downloaded in a later step. Selenium is a dynamic-crawling framework: it drives a real browser and simulates a human operating it, which is most convenient when the site's data is laid out with some regularity. Because pages are actually rendered, Selenium can reach content that a static crawler cannot, such as images that only appear after JavaScript has loaded them.
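To make the static-versus-dynamic point concrete, here is a minimal sketch (mine, not part of the original script) comparing a plain HTTP fetch with a rendered fetch. The pin URL is a hypothetical placeholder, and the chrome_options keyword matches the Selenium 3 API used in the script further down.

# Sketch: a plain HTTP GET returns the raw HTML before any JavaScript runs,
# while headless Chrome via Selenium returns the DOM after scripts have executed.
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = "http://huaban.com/pins/100000000/"  # hypothetical pin URL, for illustration only

# Static fetch: JS-injected <img> tags are absent from this HTML.
static_html = requests.get(url).text

# Rendered fetch: headless Chrome executes the page's JavaScript first.
opts = Options()
opts.add_argument("--headless")
opts.add_argument("--disable-gpu")
browser = webdriver.Chrome(chrome_options=opts)
browser.get(url)
rendered_html = browser.page_source
browser.quit()

print(len(static_html), len(rendered_html))  # the rendered DOM is usually larger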

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--headless")

images_all = set()
browser = webdriver.Chrome(chrome_options=chrome_options)
# browser = webdriver.PhantomJS()

# Both timeouts must be set for the page-load limit to take effect;
# set them once, before any page is requested.
browser.set_page_load_timeout(10000)
browser.set_script_timeout(10000)

try:
    with open("huaban_pin_asserts_all.txt", 'r', encoding="utf8") as read_file:
        for index, line in enumerate(read_file):
            url = "http://huaban.com" + line.strip()
            browser.get(url)
            time.sleep(1)  # give dynamically loaded images a moment to appear
            print(index, url)

            # The big image sits under #baidu_image_holder, sometimes wrapped in an <a>.
            try:
                img1 = browser.find_element_by_xpath('//*[@id="baidu_image_holder"]/a/img')
                if img1 is not None:
                    images_all.add(img1.get_attribute('src'))
            except Exception:
                pass  # element not present on this pin page

            try:
                img2 = browser.find_element_by_xpath('//*[@id="baidu_image_holder"]/img')
                if img2 is not None:
                    images_all.add(img2.get_attribute('src'))
            except Exception:
                pass
            time.sleep(1)

    # Write the deduplicated image URLs for the later download step.
    with open("huaban_images_all.txt", 'w', encoding="utf8") as write_file:
        for line in images_all:
            write_file.write(str(line) + "\n")
finally:
    browser.quit()
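With huaban_images_all.txt written, the download step mentioned at the top can be as simple as the sketch below. It assumes the requests library; the images/ output directory and index-based filenames are illustrative choices of mine, not part of the original script.

# Minimal download sketch: read the collected URLs and save each image locally.
import os
import requests

os.makedirs("images", exist_ok=True)

with open("huaban_images_all.txt", 'r', encoding="utf8") as url_file:
    for index, url in enumerate(url_file):
        url = url.strip()
        if not url:
            continue
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            # Name files by index; huaban URLs do not always end in a file extension.
            with open(os.path.join("images", "%06d.jpg" % index), 'wb') as out:
                out.write(resp.content)
            print(index, url)
        except Exception as e:
            print("failed:", url, e)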