Downloading images from Google

"""
really used in fetching url from google images
"""
import ast
import os
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

if __name__ == "__main__":
    # Read search keywords from the spreadsheet (no header row).
    pd_data = pd.read_excel("cleaning_data.xlsx", header=None)
    row = 7
    whole_data = []
    for i_index in range(row):
        if i_index != 1:
            for line in pd_data[20:31][i_index].values:
                # Skip empty cells and NaN values ("nan" after str()).
                if str(line).strip() not in ("", "nan"):
                    print(line.strip())
                    whole_data.append(line.strip())
    print(len(whole_data))
    length_whole_data = len(whole_data)

    for index_whole_data in range(length_whole_data):
        wikiart_path = whole_data[index_whole_data]  # e.g. "wikiart"
        original_url = 'https://www.google.co.jp/search?q=' + wikiart_path + '&safe=active&rlz=1C1GCEU_zh-CNJP821&source=lnms&tbm=isch&sa=X&ved=0ahUKEwjYhuif2dDeAhWLF3IKHQIvD1gQ_AUIFCgC&biw=1920&bih=1088'

        # One temp file (appended to while scrolling) and one final file per keyword.
        temp_path = wikiart_path + "/" + "temp_google_img_asserts_all2.txt"
        path = wikiart_path + "/" + "google_img_asserts_all2.txt"

        # os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
        if not os.path.exists(wikiart_path):
            os.makedirs(wikiart_path)

        # Headless Firefox (Selenium 3 style API); the commented-out lines show
        # the equivalent headless Chrome setup.
        # option = webdriver.ChromeOptions()
        # option.add_argument('--headless')
        # option.add_argument('--disable-gpu')
        # browser = webdriver.Chrome(chrome_options=option)
        fireFoxOptions = webdriver.FirefoxOptions()
        fireFoxOptions.set_headless()
        browser = webdriver.Firefox(firefox_options=fireFoxOptions)

        asserts_all = set()

        # Counters used to detect when scrolling stops yielding new URLs.
        now_len = 0
        pre_len = 0
        count_all = 0

        try:
            browser.get(original_url)
            # js = "var q=document.documentElement.scrollTop=100000"
            # browser.execute_script(js)
            while True:
                # Scroll down at random intervals so more thumbnails get loaded.
                time.sleep(random.randint(1, 3))
                browser.execute_script("window.scrollBy(0,1500)")

                pageSource = browser.page_source
                soup = BeautifulSoup(pageSource, 'lxml')
                # Each rg_meta div holds a JSON blob whose "ou" field is the
                # full-resolution image URL.
                asserts = soup.find_all('div', {"class": "rg_meta"})
                for assert_value in asserts:
                    data = re.sub(r'<.*?>', "", str(assert_value))
                    data = ast.literal_eval(data)
                    with open(temp_path, 'a', encoding="utf-8") as w_file:
                        w_file.write(str(data.get("ou")) + "\n")
                    asserts_all.add(str(data.get("ou")))
                print(len(asserts_all))
                now_len = len(asserts_all)
                if now_len == pre_len:
                    count_all += 1
                else:
                    count_all = 0

                # Stop after the URL count has been flat for 10 rounds.
                if count_all >= 10:
                    break
                if count_all == 8:
                    # Click the "Show more results" button if it is present
                    # (find_elements avoids a NoSuchElementException).
                    smb_buttons = browser.find_elements_by_id("smb")
                    if smb_buttons:
                        smb_buttons[0].click()
                pre_len = now_len

        except Exception as e:
            print("global", e)
        finally:
            # Write the de-duplicated URL list and shut down the browser.
            with open(path, 'w', encoding="utf8") as write_file:
                for line in asserts_all:
                    write_file.write(str(line) + "\n")
            browser.close()
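
The script above only collects image URLs; it does not fetch the image files themselves. Below is a minimal sketch of that follow-up download step, assuming the requests library is available and reading the per-keyword google_img_asserts_all2.txt file written above; the download_images helper name and the output folder are illustrative, not part of the original script.

import os
import requests

def download_images(url_file, out_dir, timeout=10):
    """Download every URL listed in url_file (one per line) into out_dir."""
    os.makedirs(out_dir, exist_ok=True)
    with open(url_file, encoding="utf-8") as f:
        # Skip blank lines and the literal "None" entries the collector may write.
        urls = [line.strip() for line in f if line.strip() and line.strip() != "None"]
    for i, url in enumerate(urls):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
            # Numbered filename; extension guessed from the URL, defaulting to .jpg.
            ext = os.path.splitext(url.split("?")[0])[1] or ".jpg"
            with open(os.path.join(out_dir, f"{i:05d}{ext}"), "wb") as out:
                out.write(resp.content)
        except Exception as e:
            print("skip", url, e)

# Example: download one keyword's URL list collected by the script above.
# download_images("wikiart/google_img_asserts_all2.txt", "wikiart/images")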