Scraping Bing with Python (Wallpapers + Search Keywords)
Scraping Bing Wallpapers
If you use Bing regularly, you have probably noticed that its home page features a new image every day. The images are beautiful, and it would be nice to download and collect each one. They are archived at Bing Daily HD Wallpapers (https://bing.ioliu.cn/).
The code is as follows:
```python
import requests
import re
import os

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}


def get_page(num):
    """Build the list of gallery page URLs to crawl."""
    page_list = []
    for i in range(1, num + 1):
        url = f'https://bing.ioliu.cn/?p={i}'
        page_list.append(url)
    return page_list


def get_html(url):
    """Fetch a page and return its HTML text."""
    r = requests.get(url, headers=headers)
    return r.text


def parse_html(html):
    """Extract the image URLs and titles from a gallery page."""
    pattern1 = re.compile(r'data-progressive.*?src="(.*?)"')
    pattern2 = re.compile(r'<h3>(.*?)</h3>')
    img_list = re.findall(pattern1, html)
    title_list = re.findall(pattern2, html)
    return img_list, title_list


def download(path, img_list, title_list):
    for i in range(len(img_list)):
        img_url = img_list[i]
        title = title_list[i]
        # Swap the 640x480 thumbnail for the 1920x1080 original
        img_url = img_url.replace('640', '1920').replace('480', '1080')
        # Strip characters that would be awkward in file names
        pattern3 = re.compile(r'[()\-/_]')
        title = re.sub(pattern3, '', title)
        print(f'Downloading: {img_url}')
        img_folder = path + keyword
        if not os.path.exists(img_folder):
            os.makedirs(img_folder)
        img_path = f'{img_folder}/{title}.jpg'
        with open(img_path, 'wb') as f:
            img_content = requests.get(img_url).content
            f.write(img_content)
        # Delete leftovers from failed downloads (anything under 50 bytes)
        if os.path.getsize(img_path) < 50:
            os.remove(img_path)


if __name__ == '__main__':
    num = 20                # number of gallery pages to crawl
    keyword = '必应壁纸'     # subfolder name for the downloads
    path = 'D:/图片/'
    page_list = get_page(num)
    for page in page_list:
        html = get_html(page)
        img_list, title_list = parse_html(html)
        download(path, img_list, title_list)
```
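One caveat about the resolution swap above: replace('640', '1920') rewrites every occurrence of 640 anywhere in the URL, not just the size token. A slightly safer sketch, assuming the gallery's _640x480.jpg naming convention (the sample URL below is made up):

```python
import re

def upscale_url(img_url):
    """Swap the 640x480 thumbnail token for the 1920x1080 original in one step."""
    # Targeting the whole WxH token leaves any other digits in the URL untouched
    return re.sub(r'640x480', '1920x1080', img_url)

print(upscale_url('http://h1.ioliu.cn/bing/Example_ZH-CN_640x480.jpg'))
# http://h1.ioliu.cn/bing/Example_ZH-CN_1920x1080.jpg
```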
Scraping Bing Images by Search Keyword
One thing to watch out for here: the text returned by requests.get(url, headers=headers).text contains many HTML-escaped characters. Double quotes, for instance, come back as &quot;, which breaks regexes written against plain quotes. Two workarounds (see the sketch below):

- write the escaped entity &quot; into the regex itself, or
- re-parse the text with etree.HTML, which unescapes the entities, and then locate the element with XPath.
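To make the two workarounds concrete, here is a minimal sketch; the raw string is a made-up stand-in for the escaped response text:

```python
import re
from lxml import etree

# A made-up snippet of what the escaped response looks like;
# `raw` stands in for requests.get(url, headers=headers).text
raw = '<a class="iusc" m="{&quot;murl&quot;:&quot;http://example.com/a.jpg&quot;}"></a>'

# Workaround 1: match the entity &quot; directly in the regex
urls = re.findall(r'&quot;murl&quot;:&quot;(.*?)&quot;', raw)

# Workaround 2: let lxml unescape the entities, then grab the attribute via XPath
m_attr = etree.HTML(raw).xpath('//a[@class="iusc"]/@m')[0]
urls2 = re.findall(r'"murl":"(.*?)"', m_attr)

print(urls)   # ['http://example.com/a.jpg']
print(urls2)  # ['http://example.com/a.jpg']
```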
Problems encountered:

- Request timeouts. Set a timeout so the crawler never hangs on a single request: socket.setdefaulttimeout(10). Otherwise a dead host eventually fails like this:

  requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.iutour.cn', port=80): Max retries exceeded with url: /uploadfile/bjzb/20141126124539763.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001A46192EC50>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.',))

- Certificate verification failures. Skip verification for hosts with bad certificates: requests.get(img_url, verify=False). The corresponding error looks like this:

  requests.exceptions.SSLError: HTTPSConnectionPool(host='bbp.jp', port=443): Max retries exceeded with url: /wp-content/uploads/2016/05/2-20.jpg (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')],)",),))

- Some downloads will fail no matter what, so the simplest fix is to wrap each one in try/except and skip the failures.
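Putting those three fixes together, here is a hedged sketch of a more defensive fetch helper; the name fetch_image and the 10-second default are illustrative choices, not from the original script:

```python
import requests
import urllib3

# verify=False triggers an InsecureRequestWarning on every call; silence it
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def fetch_image(img_url, timeout=10):
    """Return the image bytes, or None when the request fails."""
    try:
        # A per-request timeout does the same job as socket.setdefaulttimeout(10);
        # verify=False skips the failing certificate check
        r = requests.get(img_url, timeout=timeout, verify=False)
        r.raise_for_status()
        return r.content
    except requests.exceptions.RequestException:
        return None
```

The full keyword-search crawler is below; for brevity it keeps the plain try/except rather than this helper.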
```python
import requests
import re
import os
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}


def get_page(num):
    """Collect image URLs from Bing's async image-search endpoint (35 results per page)."""
    img_list = []
    pattern = re.compile(r'"murl":"(.*?)"')
    for i in range((num // 35) + 1):
        url = (f'https://cn.bing.com/images/async?q={keyword}&first={i*35}&count=35'
               f'&relp=35&scenario=ImageBasicHover&datsrc=I&layout=RowBased_Landscape&mmasync=1')
        r = requests.get(url, headers=headers)
        # Re-parsing with lxml unescapes entities such as &quot;,
        # so the plain-quote regex works on each result's m attribute
        html = etree.HTML(r.text)
        conda_list = html.xpath('//a[@class="iusc"]/@m')
        for j in conda_list:
            img_url = re.findall(pattern, j)[0]
            img_list.append(img_url)
    return img_list


def download(path, img_list):
    for i in range(len(img_list)):
        img_url = img_list[i]
        print(f'Downloading: {img_url}')
        img_folder = path + keyword
        if not os.path.exists(img_folder):
            os.makedirs(img_folder)
        try:
            img_content = requests.get(img_url).content
        except requests.exceptions.RequestException:
            # Skip images that time out or fail the TLS handshake
            continue
        with open(f'{img_folder}/{i}.jpg', 'wb') as f:
            f.write(img_content)


if __name__ == '__main__':
    num = 100            # roughly how many images to fetch
    keyword = '食品街'    # the search term
    path = 'D:/图片/'
    img_list = get_page(num)
    download(path, img_list)
```
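As in the wallpaper script, a failed download can leave a tiny junk file behind. The same under-50-bytes cleanup can be run over the keyword folder afterwards; a minimal sketch (clean_failed is my own helper name, and the threshold mirrors the first script):

```python
import os

def clean_failed(img_folder, min_size=50):
    """Remove files too small to be real images, i.e. failed downloads."""
    for name in os.listdir(img_folder):
        img_path = os.path.join(img_folder, name)
        if os.path.getsize(img_path) < min_size:
            os.remove(img_path)

clean_failed('D:/图片/食品街')
```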