"""Scrape full-resolution image URLs from a Pinterest board with headless Firefox.

NOTE(review): the original docstring claimed this fetched from
https://artsandculture.google.com/entity/m0bwbv?categoryid=art-movement,
but the code below actually scrapes a Pinterest board (`original_url`).
"""
import os
import random
import re
import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # NOTE(review): unused here (Firefox is used) — kept in case another part of the project relies on it
# Output files: temp_path accumulates URLs in append mode as they are found
# (so progress survives a crash); path receives the deduplicated set at the end.
temp_path = "temp_chinese_pinterest_img_asserts_all2.txt"
path = "chinese_pinterest_img_asserts_all2.txt"

wikiart_path = 'chinese-painting'
original_url = 'https://www.pinterest.jp/jimmyyeji/%E4%B8%AD%E5%9B%BD%E4%B9%A6%E7%94%BB-chinese-painting/'

if not os.path.exists(wikiart_path):
    os.makedirs(wikiart_path)

# Headless Firefox. `set_headless()` and the `firefox_options=` keyword were
# deprecated (and later removed) in Selenium; use `.headless` and `options=`.
firefox_options = webdriver.FirefoxOptions()
firefox_options.headless = True
browser = webdriver.Firefox(options=firefox_options)

asserts_all = set()  # deduplicated full-resolution image URLs

mark_time = 0   # NOTE(review): unused in the visible code — confirm before removing
last_value = 0  # NOTE(review): unused in the visible code — confirm before removing

now_len = 0    # size of asserts_all after the current scroll
pre_len = 0    # size of asserts_all after the previous scroll
# BUG FIX: the original initialized `count__all` (double underscore) but
# incremented/tested `count_all`, raising NameError on the first iteration,
# which the broad `except` below silently swallowed.
count_all = 0  # consecutive scrolls that discovered no new images

try:
    browser.get(original_url)
    while True:
        time.sleep(random.randint(1, 3))
        browser.execute_script("window.scrollBy(0,300)")
        soup = BeautifulSoup(browser.page_source, 'lxml')
        for img in soup.find_all('img'):
            src = img.get("src")
            # Pinterest serves thumbnails under a /236x/ path segment;
            # swapping it for /originals/ yields the full-resolution URL.
            if src and "236x" in src:
                full_src = re.sub(r'236x', "originals", src)
                print(full_src)
                with open(temp_path, 'a', encoding="utf-8") as w_file:
                    w_file.write(full_src + "\n")
                asserts_all.add(full_src)
        print(len(asserts_all))
        now_len = len(asserts_all)
        if now_len == pre_len:
            count_all += 1
        else:
            count_all = 0
        # Ten consecutive scrolls with nothing new: assume the end of the board.
        if count_all >= 10:
            break
        pre_len = now_len
    with open(path, 'w', encoding="utf-8") as write_file:
        for line in asserts_all:
            write_file.write(line + "\n")
except Exception as e:
    # Top-level boundary: report and fall through so the browser is torn down.
    print("global", e)
finally:
    # quit() (not close()) also terminates the geckodriver process,
    # preventing a leaked driver on every run.
    browser.quit()