Taking image URLs from Pinterest

Pinterest采用的是瀑布流的形式展现图片内容,无需用户翻页,新的图片不断自动加载在页面底端,让用户不断地发现新的图片。
Pinterest堪称图片版的Twitter,网民可以将感兴趣的图片在Pinterest保存,其他网友可以关注,也可以转发图片。索尼等许多公司也在Pinterest建立了主页,用图片营销旗下的产品和服务。本文实现了对Pinterest图片的爬取工作。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


"""
Collect image URLs from a Pinterest board (see original_url below).
Author's note: actually used for fetching URLs from https://artsandculture.google.com/entity/m0bwbv?categoryid=art-movement
"""
import re
from selenium import webdriver
import time
import os
import sys
import re
from bs4 import BeautifulSoup
import random
from selenium.webdriver.chrome.options import Options

# Checkpoint file: matching URLs are appended here while the page is scrolled.
temp_path = "temp_chinese_pinterest_img_asserts_all2.txt"
# Final output file: the de-duplicated set of URLs, one per line.
path = "chinese_pinterest_img_asserts_all2.txt"

# Directory created at startup; nothing in the visible script writes into it.
wikiart_path = 'chinese-painting'
# Pinterest board whose images are collected.
original_url = 'https://www.pinterest.jp/jimmyyeji/%E4%B8%AD%E5%9B%BD%E4%B9%A6%E7%94%BB-chinese-painting/'

if not os.path.exists(wikiart_path):
    os.makedirs(wikiart_path)

# Headless Firefox. The "-headless" argument and the `options=` keyword replace
# the deprecated set_headless() / firefox_options= spellings, which newer
# Selenium releases no longer accept.
# (Chrome alternative: webdriver.ChromeOptions() with '--headless' and
# '--disable-gpu', passed to webdriver.Chrome.)
fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.add_argument("-headless")
browser = webdriver.Firefox(options=fireFoxOptions)

# Every distinct image URL found so far.
asserts_all = set()

# Not read anywhere in the visible script; kept in case other code uses them.
mark_time = 0
last_value = 0

# Progress tracking for the scroll loop:
#   now_len   - number of distinct URLs after the current pass
#   pre_len   - number of distinct URLs after the previous pass
#   count_all - consecutive passes that found nothing new
now_len = 0
pre_len = 0
# Was misspelled `count__all`, which left the name the loop actually reads
# (`count_all`) undefined.
count_all = 0

# Consecutive scroll passes that produced no new URL. Initialised right here
# because the loop reads it on the very first pass when no image is found yet;
# without a prior assignment that read raises NameError.
count_all = 0

try:
    browser.get(original_url)
    while True:
        # Pause 1-3 s between scrolls (presumably to let new images load).
        time.sleep(random.randint(1, 3))
        browser.execute_script("window.scrollBy(0,300)")
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # Keep only <img> tags whose src contains "236x" and rewrite that
        # segment to "originals" (apparently thumbnail -> full-size URL).
        found = []
        for img in soup.find_all('img'):
            src = img.get("src")
            if src and "236x" in src:
                found.append(src.replace("236x", "originals"))

        if found:
            # Append every match of this pass to the checkpoint file; the same
            # URL can therefore appear there more than once across passes.
            with open(temp_path, 'a', encoding="utf-8") as w_file:
                for url in found:
                    print(url)
                    w_file.write(url + "\n")
            asserts_all.update(found)

        print(len(asserts_all))
        now_len = len(asserts_all)
        if now_len == pre_len:
            count_all += 1
        else:
            count_all = 0

        # Stop after 10 passes in a row without any new URL.
        if count_all >= 10:
            break
        pre_len = now_len

    # Write the de-duplicated result once scrolling has finished.
    with open(path, 'w', encoding="utf8") as write_file:
        for line in asserts_all:
            write_file.write(str(line) + "\n")
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through
    # to the cleanup below.
    print("global", e)
finally:
    # quit() ends the whole WebDriver session, including the driver process;
    # close() would only close the current window.
    browser.quit()