Fetch ultra-high-resolution images from Google Arts & Culture asset URLs.

从 Google Arts and Culture 抓取超高清大图。因为 Google Arts and Culture 对具有很高艺术价值的图片在前端做过分割处理，很难直接爬取到原图的 URL。本工具采用把浏览器页面设置得非常大然后整页截图的方式，同样能得到非常高的清晰度。唯一的代价是截取之后的图片会占用大量内存。

# This program is applied to take 4k images from google art and culture website
from selenium import webdriver
import os, shutil
import time as t
from PIL import Image, ImageChops
import tkinter as tk
from threading import Thread
from tkinter import filedialog
from ctypes import windll


exImg_value = 1

def is_picture(counter):
    """Return True when screenshot number `counter` shows rendered content.

    Samples the pixel at (2000, 1300): the Google Arts & Culture page
    background is pure white, so a non-white sample means the artwork has
    appeared.  NOTE(review): assumes the screenshot is at least
    2001x1301 px — the window is set to 4000x4000 upstream; confirm if the
    window size ever changes.
    """
    # Use a context manager so the file handle for these very large PNGs
    # is released immediately (the original leaked it).
    with Image.open('temp/scrapping/image' + str(counter) + '.png') as im:
        r, g, b = im.convert('RGB').getpixel((2000, 1300))
    # Pure white means the artwork has not rendered yet.
    return (r, g, b) != (255, 255, 255)


def is_same(counter):
    """Return True when screenshot `counter` has the same file size as the
    previous one, i.e. the page stopped changing between captures.

    Side effect: deletes the previous screenshot to bound disk usage.
    For counter == 0 there is no predecessor, so the answer is False
    (the original fell off the end and returned None implicitly).
    """
    if counter <= 0:
        return False
    new_size = os.path.getsize('temp/scrapping/image%s.png' % str(counter))
    old_size = os.path.getsize('temp/scrapping/image%s.png' % str(counter - 1))
    # Remove the superseded screenshot before comparing results are used;
    # sizes were already read above, so this is safe.
    os.remove('temp/scrapping/image%s.png' % str(counter - 1))
    return new_size == old_size


def trim(image):
    """Crop the uniform border (the color of the top-left pixel) off `image`.

    Builds a solid background the color of pixel (0, 0), diffs it against
    the image, amplifies the difference so near-background noise drops out,
    then crops to the bounding box of what remains.  For a completely
    solid-color image the bbox is empty; the original returned None in that
    case, which crashed the caller on .save() — return the image unchanged
    instead.
    """
    bg = Image.new(image.mode, image.size, image.getpixel((0, 0)))
    diff = ImageChops.difference(image, bg)
    # Scale x2 and subtract 100 so faint compression artifacts near the
    # background color fall below the bbox threshold.
    diff = ImageChops.add(diff, diff, 2.0, -100)
    bbox = diff.getbbox()
    if bbox:
        return image.crop(bbox)
    return image  # solid-color image: nothing to trim


def remove(value, delete_chars):
    """Return `value` with every character in `delete_chars` deleted.

    Uses str.translate (mapping each char to None) — a single C-level
    pass instead of one str.replace call per character.
    """
    return value.translate({ord(c): None for c in delete_chars})

def file_save(name, status):
    """Prompt the user for a save location and move the scraped PNG there.

    `status` is the path of the already-saved temp PNG; `name` is the
    suggested file name in the dialog.  If the user picks a different
    location the image is re-saved there and the temp file removed; if
    they cancel, nothing happens.
    """
    path = status
    f = filedialog.asksaveasfile(
        mode='wb', defaultextension=".png", title="Saving picture",
        initialfile=name,
        filetypes=(("PNG high resolution image", "*.png"), ("all files", "*.*")))
    if f is None:
        return  # dialog cancelled
    try:
        # NOTE(review): the '/' -> '\\' replace assumes Windows-style
        # paths from the dialog — confirm if this ever runs elsewhere.
        if os.path.abspath(path) != f.name.replace('/', '\\'):
            im = Image.open(path)
            im.save(f)
            os.remove(path)
    finally:
        # The original leaked the handle when the chosen path equalled
        # the source path; always close the dialog's file object.
        f.close()


def initialize_folders():
    """Create a fresh temp/scrapping working directory.

    Removes any stale temp tree from a previous run so screenshots from
    an earlier URL cannot leak into this one.
    """
    if os.path.exists('temp'):
        shutil.rmtree('temp')
    # makedirs creates the intermediate 'temp' directory as well.
    os.makedirs('temp/scrapping')

def do_scrapping(url):
    """Load `url` in a headless 4000x4000 Chrome window, click the page's
    open-image/zoom controls, and keep screenshotting until two consecutive
    captures have identical file sizes (the page has stopped changing).

    Returns (last_file, name_file): the path of the final screenshot
    ('' when no stable capture was reached in 45 attempts) and a
    filesystem-safe name built from the page's author/title metadata.
    """
    old_url = url
    url = ''

    # Strip the query string: keep everything before the first '?'.
    for char in old_url:
        if char == '?':
            break
        url += char

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # NOTE(review): executable_path/chrome_options are the Selenium 3 API;
    # keep as-is to match the installed driver version.
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe", chrome_options=options)
    # Huge window so the final screenshot captures the artwork at very
    # high resolution (the resulting PNGs use a lot of memory).
    driver.set_window_size(4000, 4000)
    driver.get(url)
    xPath3 = r".//html/body/div[3]/div[3]/div/div/div/div[3]/div" # img xPath
    xPath2 = r".//html/body/div[3]/div[3]/div/div/div[2]/div[1]/div[2]/div[1]/div" # zoom xPath
    xPath1 = r".//html/body/div[3]/div[3]/div/div/div[3]/div/content/span" # open img xPath
    image_appeared = False # flag for starting click on image
    image_zoom_taked = False # set once the zoomed view has been clicked open
    last_file = '' # last succeed file
    driver.implicitly_wait(1)
    try:
        authorPic = driver.find_element_by_xpath(r'/html[1]/body[1]/div[3]/div[3]/div[1]/div[1]/div[6]/section[2]/div[1]/ul[1]/li[2]/a[1]').text # author of the picture xPath
    except Exception:
        authorPic = ''

    try:
        name_pic = driver.find_element_by_xpath(r'/html[1]/body[1]/div[3]/div[3]/div[1]/div[1]/div[6]/section[2]/div[1]/ul[1]/li[1]').text[7::] # name of the picture xPath
        if authorPic != '':
            name_pic = ' - ' + name_pic
    except Exception:
        # Fall back to the page title, dropping its fixed-length suffix.
        name_pic = driver.title[0:-23]

    name_file = authorPic + name_pic
    # Strip characters that are illegal in Windows file names.
    name_file = remove(name_file, '\/:*?"<>|')
    t.sleep(3)
    for i in range(0, 45): # 45 attempts
        t.sleep(1)
        if image_appeared:
            t.sleep(3)
            # Mode 1 clicks the "open image" control; otherwise the zoom
            # control — see exImg_value at module level.
            if exImg_value == 1:
                elem2 = driver.find_element_by_xpath(xPath1)
            else:
                elem2 = driver.find_element_by_xpath(xPath2)
            elem3 = driver.find_element_by_xpath(xPath3)
            # Click via JS: the elements may not be interactable directly.
            driver.execute_script("arguments[0].click();", elem2)
            driver.execute_script("arguments[0].click();", elem3)
            t.sleep(3)
            image_appeared = False
            image_zoom_taked = True
        else:
            pass
        driver.save_screenshot('temp/scrapping/image%s.png' % str(i))

        # Once a non-white frame shows up (and we haven't clicked yet),
        # schedule the click for the next iteration.
        if is_picture(i) and not image_zoom_taked:
            image_appeared = True
        # Two same-size consecutive screenshots => page stabilised.
        if is_same(i):
            last_file = 'temp/scrapping/image%s.png' % str(i)
            break
    driver.quit()
    return last_file, name_file

def do_finally_changes(last_file, name_file):
    """Crop/trim the final screenshot and save it as Ukiyo/<name_file>.png.

    `last_file` is the stable screenshot path from do_scrapping ('' on
    failure).  Returns `name_file` on success, or an error string when
    no screenshot was produced.  Cleans up the whole temp tree.
    """
    if last_file != '':
        shutil.copyfile(last_file, 'temp/image_result.png')
        shutil.rmtree('temp/scrapping')
        imOp = Image.open('temp/image_result.png')
        if exImg_value == 1:
            # Drop the 50px browser header strip; 4000 matches the window
            # size set in do_scrapping.
            im = imOp.crop((0, 50, 4000, 4000)) # 20!8
        else:
            im = imOp
        # trim() may return None for a solid-color capture — keep the
        # uncropped image in that case instead of crashing on .save().
        trimmed = trim(im)
        if trimmed is not None:
            im = trimmed
        im.save("Ukiyo/" + name_file + '.png')
        shutil.rmtree('temp')
        return name_file
    return 'An error occurred with processing image'

def start_process(index, url):
    """Run the full pipeline for one asset URL, then append it to log.txt."""
    print("initialize_folders()")
    initialize_folders()
    print("do_scrapping({0})".format(url))
    screenshot_path, _scraped_name = do_scrapping(url)
    # The output file is named after the URL's trailing asset id, not the
    # scraped title.
    asset_id = str(url.split("/")[-1])
    print("saving image {0}".format(asset_id + '.png'))
    result = do_finally_changes(screenshot_path, asset_id)
    # file_save(result + '.png', result + '.png')
    with open("log.txt", "a", encoding="utf8") as log_file:
        log_file.write(str(index) + " : " + url + "\n")

def start():
    """Scrape every relative asset path listed in asserts.txt, one per line."""
    with open("asserts.txt", 'r', encoding="utf8") as read_file:
        lines = read_file.readlines()
    for index, line in enumerate(lines):
        url = "https://artsandculture.google.com" + line.strip()
        print(index, url)
        start_process(index, url)
path = "Ukiyo"
if not os.path.exists(path):
os.makedirs(path)
start()

# start_process(0,"https://artsandculture.google.com/asset/sanjūrokkasen/RQEYzE71xKwOlQ")