#!/usr/bin/env python3
| from urllib import request from lxml import etree import time
for pagenum in range(330,400): #伪造头信息 headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36" } #网址 url = r"https://www.baidu.com"+str(pagenum)+'.html' #获取网页源代码 response = request.Request(url=url,headers=headers) res = request.urlopen(response) html = res.read().decode('utf-8') #根据xpath获取图片地址 name_list = etree.HTML(html).xpath("/html/body/div[2]/div[1]/div[3]/div/p/img/@src")[0] response2 = request.Request(url=name_list,headers=headers) res = request.urlopen(response2).read() #将图片写入硬盘 with open(str(pagenum)+'.webp','wb') as f: f.write(res) #等待1s,避免网站会因为访问过快导致访问失败。 time.sleep(1)

推荐使用 Python 3 运行该脚本:自动依次访问网页分页,解析页面中包含的图片地址,并将图片保存到硬盘。