1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
""" really used in fetching url from google images """ import re from selenium import webdriver import time import os import sys import ast import re from bs4 import BeautifulSoup import random from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np

# Workbook that holds the search terms, and the cell range that is read:
# rows 20..30 of columns 0 and 2..6 (column 1 is skipped).
SPREADSHEET_PATH = "cleaning_data.xlsx"
QUERY_COLUMNS = (0, 2, 3, 4, 5, 6)
FIRST_ROW = 20
LAST_ROW = 31  # exclusive

SEARCH_URL_PREFIX = 'https://www.google.co.jp/search?q='
SEARCH_URL_SUFFIX = (
    '&safe=active&rlz=1C1GCEU_zh-CNJP821&source=lnms&tbm=isch&sa=X'
    '&ved=0ahUKEwjYhuif2dDeAhWLF3IKHQIvD1gQ_AUIFCgC&biw=1920&bih=1088'
)

# Strips the surrounding markup from a serialized <div class="rg_meta"> tag.
TAG_PATTERN = re.compile(r'<.*?>')

# Scrolling stops after this many consecutive rounds without a new URL.
MAX_STALLED_ROUNDS = 10
# After this many stalled rounds the element with id "smb" is clicked once
# (presumably the "show more results" button -- verify against the page).
CLICK_MORE_AT_ROUND = 8


def collect_queries(frame, columns=QUERY_COLUMNS,
                    first_row=FIRST_ROW, last_row=LAST_ROW):
    """Return the non-empty cell texts of *frame* used as search terms.

    Cells are read column by column from rows ``first_row`` (inclusive) to
    ``last_row`` (exclusive). Every cell is converted with ``str`` before it
    is stripped, so numeric cells no longer raise ``AttributeError``; blank
    cells and NaN (rendered as ``"nan"``) are dropped.
    """
    rows = frame[first_row:last_row]
    queries = []
    for column in columns:
        for cell in rows[column].values:
            text = str(cell).strip()
            if text and text != "nan":
                queries.append(text)
    return queries


def scrape_image_urls(query):
    """Scroll the Google Images results for *query* and save the image URLs.

    A directory named after *query* is created. Every URL seen on each
    scroll is appended to ``temp_google_img_asserts_all2.txt`` inside it, and
    the de-duplicated set is written to ``google_img_asserts_all2.txt`` when
    scraping ends, whether it ended normally or through an error.

    Returns the set of collected URLs.
    """
    # Encode the term so spaces, '&', '#', ... cannot corrupt the query string.
    search_url = SEARCH_URL_PREFIX + quote_plus(query) + SEARCH_URL_SUFFIX
    temp_path = os.path.join(query, "temp_google_img_asserts_all2.txt")
    final_path = os.path.join(query, "google_img_asserts_all2.txt")
    os.makedirs(query, exist_ok=True)

    # NOTE(review): set_headless / firefox_options / find_elements_by_id are
    # the Selenium 3 spellings this file was written against -- confirm the
    # installed Selenium version before upgrading.
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.set_headless()
    browser = webdriver.Firefox(firefox_options=firefox_options)

    found = set()
    previous_count = 0
    stalled_rounds = 0
    try:
        browser.get(search_url)
        while True:
            # Random pause between scrolls.
            time.sleep(random.randint(1, 3))
            browser.execute_script("window.scrollBy(0,1500)")

            soup = BeautifulSoup(browser.page_source, 'lxml')
            page_urls = []
            for tag in soup.find_all('div', {"class": "rg_meta"}):
                # The tag text is parsed as a Python literal; this fails on
                # JSON-only tokens such as true/false/null.
                meta = ast.literal_eval(TAG_PATTERN.sub("", str(tag)))
                page_urls.append(str(meta.get("ou")))

            if page_urls:
                # One open per page instead of one per result.
                with open(temp_path, 'a', encoding="utf-8") as temp_file:
                    temp_file.writelines(url + "\n" for url in page_urls)
            found.update(page_urls)
            print(len(found))

            current_count = len(found)
            if current_count == previous_count:
                stalled_rounds += 1
            else:
                stalled_rounds = 0

            if stalled_rounds >= MAX_STALLED_ROUNDS:
                break
            if stalled_rounds == CLICK_MORE_AT_ROUND:
                # find_elements_* returns an empty list when the element is
                # absent, instead of raising like find_element_*.
                more_buttons = browser.find_elements_by_id("smb")
                if more_buttons:
                    more_buttons[0].click()
            previous_count = current_count
    except Exception as e:
        # Best effort per query: report the error and keep what was collected.
        print("global", e)
    finally:
        try:
            with open(final_path, 'w', encoding="utf8") as write_file:
                for url in found:
                    write_file.write(str(url) + "\n")
        finally:
            # quit() ends the whole driver session, and runs even if the
            # write above fails.
            browser.quit()
    return found


def main():
    """Read the search terms from the spreadsheet and scrape each of them."""
    pd_data = pd.read_excel(SPREADSHEET_PATH, header=None)
    whole_data = collect_queries(pd_data)
    for query in whole_data:
        print(query)
    print(len(whole_data))

    for query in whole_data:
        scrape_image_urls(query)


if __name__ == "__main__":
    main()