Python 爬取高清桌面壁纸-向日葵屋

Python 爬取高清桌面壁纸

今天写了一个脚本，用来爬取 ZOL 桌面壁纸网站的高清图片。

链接：http://desk.zol.com.cn/1920x1080/

本程序只爬取了美女板块的图片，若要下载其他板块，只需修改程序中的“meinv”即可。

代码如下（基于 Python 2）：

#coding=utf-8
import urllib  
import re
import time
class Spider:
    """Crawl ZOL desktop-wallpaper albums and download their full-size images.

    The class targets the Python 2 ``urllib`` API (``urllib.urlopen`` and
    ``urllib.urlretrieve``).

    Constructing an instance immediately fetches the listing pages and
    stores the album links found there in ``itemGroupPic``. Passing that
    list to ``MatchUrl`` walks every album page by page and saves each
    image to ``savePathTemplate``.
    """

    baseUrl = 'http://desk.zol.com.cn/'
    # Running counter used as the file name of each downloaded image.
    pic_index = 0
    # Destination of downloaded images; "%s" receives pic_index.
    # The target directory must be created beforehand.
    savePathTemplate = r'D:\PictureAvi\%s.jpg'
    # Seconds to wait before crawling starts / before each listing request.
    startDelay = 1
    pageDelay = 5

    # Patterns are compiled once. "[^\"]*?" keeps every match inside the
    # attribute value instead of letting a greedy ".*" run along the line.
    _groupAnchorRe = re.compile(r'<a class="pic" href="/bizhi/[^"]*?\.html')
    _groupPathRe = re.compile(r'bizhi/[^"]*?\.html')
    _bigImageRe = re.compile(r'<img id="bigImg" src="(http://[^"]*?\.jpg)"')
    _nextPageRe = re.compile(
        r'<a id="pageNext" class="next" href="[^"]*?(bizhi[^"]*?\.html)"')

    def __init__(self, page_count, start_page=10, category="meinv"):
        """Collect album links from the listing pages of one category.

        Args:
            page_count: Exclusive upper bound of the listing page index;
                pages ``start_page`` .. ``page_count - 1`` are fetched.
            start_page: Index of the first listing page to fetch.
            category: Site section placed in the listing URL.
        """
        # Per-instance list: a class-level list would be shared by every
        # Spider instance.
        self.itemGroupPic = []
        time.sleep(self.startDelay)
        url = self.baseUrl + category + "/1920x1080/"
        for page_index in range(start_page, page_count):
            time.sleep(self.pageDelay)
            html = self.getHtml(url, page_index)
            self.getbizhiurlList(self.getPageImageGroup(html))

    def _fetch(self, url):
        """Return the body of ``url`` and always close the connection."""
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()

    def getHtml(self, url, page_index):
        """Return the HTML of listing page ``<url><page_index>.html``."""
        return self._fetch(url + str(page_index) + ".html")

    def getPageImageGroup(self, html):
        """Return the album anchor prefixes found in a listing page.

        Each item has the form ``<a class="pic" href="/bizhi/....html``.
        """
        return self._groupAnchorRe.findall(html)

    def getbizhiurlList(self, imagelist):
        """Append the relative album path of each anchor to ``itemGroupPic``.

        Every appended entry is a list of matches (normally one element,
        e.g. ``['bizhi/1_2_2.html']``), which is the shape ``MatchUrl``
        expects. Anchors without an album path are skipped.
        """
        for anchor in imagelist:
            paths = self._groupPathRe.findall(anchor)
            if paths:
                self.itemGroupPic.append(paths)

    def _findImageUrls(self, html):
        """Return the ``src`` URLs of the ``bigImg`` elements in ``html``."""
        return self._bigImageRe.findall(html)

    def _findNextUrl(self, html):
        """Return the relative path of the next album page, or ``""``."""
        match = self._nextPageRe.search(html)
        if match is None:
            return ""
        return match.group(1)

    def GetCurrentUrlAndDownload(self, url):
        """Download the image(s) shown on ``url`` and locate the next page.

        Returns:
            The relative path (``bizhi...html``) of the next page, or
            ``""`` when the page has no usable next link.
        """
        # NOTE: the text returned by read() differs slightly from the
        # source shown by the browser's "view source"; could be improved.
        html = self._fetch(url)
        for image_url in self._findImageUrls(html):
            print(u'正在下载'+str(self.pic_index)+u'图片')
            urllib.urlretrieve(image_url,
                               self.savePathTemplate % self.pic_index)
            self.pic_index += 1
        return self._findNextUrl(html)

    def MatchUrl(self, imagelist):
        """Walk every album in ``imagelist`` and download all its pages.

        Args:
            imagelist: Entries as stored in ``itemGroupPic``; the first
                element of each entry is the album's relative path.
        """
        # Pages already fetched in this call; stops the walk if the
        # "next" links ever lead back to a page that was already handled.
        visited = set()
        for imgurl in imagelist:
            if not imgurl:
                continue
            next_url = imgurl[0]
            while next_url != '' and next_url not in visited:
                visited.add(next_url)
                next_url = self.GetCurrentUrlAndDownload(
                    self.baseUrl + next_url)
if __name__ == '__main__':
    # Building the spider collects the album links from the listing pages
    # (page_count=15); MatchUrl then downloads every image of each album.
    crawler = Spider(15)
    album_links = crawler.itemGroupPic
    crawler.MatchUrl(album_links)
    # Report that all downloads have finished.
    print(u'结束下载')