python 用代理简单的爬取ganji网租房信息

红太狼 2023-06-16 03:53 9阅读 0赞

python 用代理proxy 简单的爬取ganji网的租房信息。

##### 环境： #####

**1.python3.7**  
**2.requests 请求模块(pip install requests)**  
**3.pyquery 解析模块（pip install pyquery）**  
　　pyquery库是jQuery的Python实现，能够以jQuery的语法来操作解析 HTML 文档，易用性和解析速度都很好，和它差不多的还有BeautifulSoup，都是用来解析的。相比BeautifulSoup，因为之前使用过jquery.js 写过web网页，感觉pyquery更亲切。  
**4.logging 生成日志的模块(Python 标准库自带，无需 pip 安装)**  
　　本来不想用这个日志模块的，但是用print的话没法记录一些错误的信息，所以就用上了。

##### 步骤： #####

**1.获取代理ip地址；**  
只是练习的小demo,去抓取的ip都是免费的！！！  
**2.验证代理ip地址；**  
通过百度验证这些代理ip地址是否有用！  
**3.使用验证ok的代理ip地址抓取数据；**  
使用这些有用的ip地址来访问ganji网，获取一些信息！  
**4.demo的大概结构**  
![在这里插入图片描述][20191124104042620.png]

##### 详细步骤： #####

**1.获取代理ip地址；**  
由于ganji网非常敏感，经常弹出验证的信息，在不破解的情况下只能通过代理的方式来抓取信息。用单线程去提供代理的网站上去抓取前10页的HTTP/HTTPS的ip地址，整理后保存在json文件中。

import json, os
    import requests
    from pyquery import PyQuery as pq
    import logging
    import random
    
    # Directory containing this script; log and data paths are resolved against it.
    _path = os.path.dirname(__file__)
    logger = logging.getLogger()

    # logging.basicConfig(filename=...) opens the log file right away and fails
    # with FileNotFoundError when its parent directory is missing, so make sure
    # the directory exists first.
    _log_file = os.path.join(_path, 'log/get_ip_log.txt')
    os.makedirs(os.path.dirname(_log_file), exist_ok=True)
    logging.basicConfig(level=logging.INFO, filename=_log_file)
    
    
    class Get_IP(object):
        """Scrape free proxy addresses from a proxy-list site and save them as JSON.

        Every accepted proxy becomes a one-entry dict ``{scheme: 'scheme://ip:port'}``
        (the mapping shape ``requests`` accepts as ``proxies``) collected in
        ``self.list_date``; ``main`` dumps that list to ``result_ip_file``.
        """

        # Listing sections fetched below the base URL, in request order:
        # nn = domestic high-anonymity, nt = domestic ordinary,
        # wn = domestic HTTPS, wt = domestic HTTP proxies.
        _SECTIONS = ('nn', 'nt', 'wn', 'wt')

        def __init__(self, url, result_ip_file):
            """
            :param url: base URL of the proxy site; section paths are appended to it.
            :param result_ip_file: path of the JSON file the proxy list is written to.
            """
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
            self.request_s = requests.session()
            self.url = url
            self.result_ip_file = result_ip_file
            self.list_date = list()  # accumulated proxy dicts

        def write_get_ip_file(self):
            """Dump ``self.list_date`` to ``self.result_ip_file`` as indented UTF-8 JSON."""
            # open(..., 'w') does not create missing parent directories.
            target_dir = os.path.dirname(self.result_ip_file)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(self.result_ip_file, 'w', encoding='utf8') as f:
                json.dump(self.list_date, fp=f, ensure_ascii=False, indent=2)

        def get_ip_info(self, trs):
            """Pick the usable proxies out of table rows and append them to ``self.list_date``.

            :param trs: iterable of row objects that can be called with a CSS
                selector (as yielded by PyQuery's ``.items()``).
            """
            for tr in trs:
                if tr('th'):
                    logging.info('page th ---title')  # header row, skip
                    continue

                cells = tr('td')
                ip = cells.eq(1).text().strip()
                port = cells.eq(2).text().strip()
                # Lower-cased because requests matches proxy keys against the
                # lower-case URL scheme ('http' / 'https').
                scheme = cells.eq(5).text().strip().lower()

                if not ip or not port:
                    logging.info('row without ip or port skipped')
                    continue

                ip_live_time = cells.eq(6).text()
                if '分钟' in ip_live_time:
                    logging.info('ip [{}] live too short'.format(ip))  # alive only for minutes, skip
                    continue

                if 'socks' in scheme:
                    logging.info('ip [{}] is socks'.format(ip))  # neither http nor https, skip
                    continue

                # The ':' between host and port is required for a valid proxy URL.
                ip_dict = {scheme: '{}://{}:{}'.format(scheme, ip, port)}
                logging.info('ok ip---->{}'.format(ip_dict))

                self.list_date.append(ip_dict)

        def _fetch_rows(self, url):
            """GET ``url`` and return an iterator over the elements matching ``table>tr``."""
            html = self.request_s.get(url, headers=self.headers, timeout=60).content.decode()
            return pq(html)('table>tr').items()

        def get_response(self, pages=10):
            """Fetch the base page, then ``pages`` pages of every section, collecting proxies.

            :param pages: number of pages requested per section (page indices 0..pages-1).
            """
            self.get_ip_info(trs=self._fetch_rows(self.url))

            for section in self._SECTIONS:
                for page in range(pages):
                    page_url = self.url + '{}/{}'.format(section, page)
                    self.get_ip_info(trs=self._fetch_rows(page_url))

        def main(self):
            """Scrape all pages, then write the collected proxies to the JSON file."""
            self.get_response()
            self.write_get_ip_file()
    
    
    if __name__ == '__main__':
        # Scrape the proxy site and store the result below this script's directory.
        output_json = os.path.join(_path, 'proxy_ip/get_ip_data.json')
        scraper = Get_IP(result_ip_file=output_json, url='https://www.xicidaili.com/')
        scraper.main()

**2.验证代理ip地址；**  
通过www.baidu.com来验证这些获取到的代理ip是否真的有用，由于ip有好几千个，所以使用100个线程来验证，先读取获取到的ip,再用ip访问百度，然后看返回的状态是不是200（response.status\_code == 200）判断ip是否有用，有用就保存在另一个json文件中。

import json, os
    import requests
    import threading
    from pyquery import PyQuery as pq
    import logging
    import random
    
    # Directory containing this script; log and data paths are resolved against it.
    _path = os.path.dirname(__file__)
    # logging.basicConfig(filename=...) opens the log file right away and fails
    # with FileNotFoundError when its parent directory is missing, so make sure
    # the directory exists first.
    _log_file = os.path.join(_path, 'log/back_ip_log.txt')
    os.makedirs(os.path.dirname(_log_file), exist_ok=True)
    logging.basicConfig(level=logging.INFO, filename=_log_file)
    
    
    class Check_IP(object):
        """Validate scraped proxies by requesting Baidu through each of them.

        Proxies that answer with HTTP 200 are collected in ``good_ip_lists`` and
        written to ``good_ip_file``; every other proxy is recorded in the log.
        """

        def __init__(self, check_ip_file, good_ip_file):
            """
            :param check_ip_file: JSON file holding the list of proxy dicts to check.
            :param good_ip_file: path the list of working proxy dicts is written to.
            """
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
            self.check_ip_file = check_ip_file
            self.good_ip_file = good_ip_file
            self.check_ip_lists = list()  # proxies still waiting to be checked
            self.good_ip_lists = list()   # proxies that passed the check
            self.get_test_ip_lock = threading.Lock()  # guards check_ip_lists
            self.save_good_ip = threading.Lock()      # guards good_ip_lists

        def read_ip(self):
            """Load the proxies to check from ``self.check_ip_file``."""
            with open(self.check_ip_file, 'r', encoding='utf8') as f:
                self.check_ip_lists = json.load(fp=f)

        def write_good_ip(self):
            """Dump ``self.good_ip_lists`` to ``self.good_ip_file`` as indented UTF-8 JSON."""
            # open(..., 'w') does not create missing parent directories.
            target_dir = os.path.dirname(self.good_ip_file)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(self.good_ip_file, 'w', encoding='utf8') as f:
                json.dump(self.good_ip_lists, fp=f, ensure_ascii=False, indent=2)

        def _take_next_ip(self):
            """Remove and return one pending proxy, or ``None`` when none are left.

            The emptiness test and the pop happen under the same lock; testing
            outside the lock lets another thread drain the list in between and
            makes ``pop()`` raise IndexError.
            """
            with self.get_test_ip_lock:
                if not self.check_ip_lists:
                    return None
                return self.check_ip_lists.pop()

        def check_ip(self):
            """Worker loop: take pending proxies one by one and test each against Baidu."""
            while True:
                ip_dict = self._take_next_ip()
                if ip_dict is None:
                    break
                print(ip_dict)
                # NOTE(review): requests selects a proxy by the URL scheme, so an
                # entry that only has an 'http' key is presumably bypassed for
                # this https URL - confirm against the requests proxies docs.
                try:
                    response = requests.get('https://www.baidu.com/', headers=self.headers, proxies=ip_dict, timeout=20)
                except Exception as ex:
                    # Deliberately broad: any failure just means this proxy is
                    # unusable and must not kill the worker thread.
                    print('ex is [{}]'.format(ex))
                    logging.info('{} is can not use'.format(ip_dict))
                    continue

                if response.status_code == 200:
                    print(response.status_code)
                    with self.save_good_ip:
                        self.good_ip_lists.append(ip_dict)
                else:
                    # A non-200 answer also counts as unusable and is logged.
                    logging.info('{} is can not use'.format(ip_dict))

        def main(self, thread_count=100):
            """Read the proxies, check them with worker threads and save the good ones.

            :param thread_count: number of worker threads to start.
            """
            self.read_ip()

            workers = [threading.Thread(target=self.check_ip) for _ in range(thread_count)]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()

            self.write_good_ip()
    
    
    if __name__ == '__main__':
        # Input: the proxies collected by the scraping script.
        check_ip_file = os.path.join(_path, 'proxy_ip/get_ip_data.json')
        # Output: must be the file the crawler script loads its proxies from
        # ('proxy_ip/good_ip.json'); the previous name 'good_ip_file' did not match.
        good_ip_file = os.path.join(_path, 'proxy_ip/good_ip.json')

        checker = Check_IP(check_ip_file=check_ip_file, good_ip_file=good_ip_file)
        checker.main()

**3.使用验证ok的代理ip地址抓取数据**  
使用验证过的ip地址访问ganji 网, 使用pyquery解析响应内容，把结果保存在json文件中。

import requests
    from pyquery import PyQuery as pq
    import threading
    import logging
    import time, random
    import json, os
    
    # Directory containing this script; log and data paths are resolved against it.
    _path = os.path.dirname(__file__)

    logger = logging.getLogger()
    # logging.basicConfig(filename=...) opens the log file right away and fails
    # with FileNotFoundError when its parent directory is missing, so make sure
    # the directory exists first.
    _log_file = os.path.join(_path, 'log/log.txt')
    os.makedirs(os.path.dirname(_log_file), exist_ok=True)
    logging.basicConfig(level=logging.INFO, filename=_log_file)
    
    
    class GANJI(object):
        """Crawl ganji.com listing pages through random proxies and save each item.

        Per the original author's note, the site keeps asking for verification even
        behind proxies, so only list-page data (not detail pages) is collected.
        """

        # Seconds to wait for a response; without a timeout a dead proxy would
        # block the crawl indefinitely.
        REQUEST_TIMEOUT = 60
        # Selector of one listing entry on a list page.
        _ITEM_SELECTOR = '.f-main-list .f-list-item.ershoufang-list'

        def __init__(self, start_url, proxy_ip_file, result_file):
            """
            :param start_url: listing URL template with one ``{}`` for the page number.
            :param proxy_ip_file: JSON file holding the list of proxy dicts to use.
            :param result_file: file every scraped item is appended to.
            """
            self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
            self.start_url = start_url
            self.result_file = result_file
            self.proxy_ip_file = proxy_ip_file
            self.request_s = requests.session()

            self.proxy_list = list()  # proxy dicts loaded by read_good_ip_file

        def get_lists_page_info(self, list_div):
            """Build a dict for every listing entry and hand it to ``save_info``.

            :param list_div: iterable of entry objects that can be called with a
                CSS selector (as yielded by PyQuery's ``.items()``).
            """
            for item_doc in list_div:
                # Query the repeated selectors once per entry.
                size_spans = item_doc('.dd-item.size span')
                address_rows = item_doc('.dd-item.address')
                area_links = address_rows.eq(0)('a')

                item_dict = dict(
                    title=item_doc('.dd-item.title a').text(),  # title
                    item_info_type=size_spans.eq(0).text(),  # layout
                    item_info_are=size_spans.eq(2).text(),  # area
                    item_info_face=size_spans.eq(4).text(),  # orientation
                    item_info_fitment=size_spans.eq(6).text(),  # decoration state
                    item_info_address1=area_links.eq(0).text(),  # first-level address
                    item_info_address2=area_links.eq(1).text(),  # second-level address
                    item_info_publisher=address_rows.eq(1)('span.address-eara').text(),  # publisher
                    item_info_feature=item_doc('.dd-item.feature span').text(),  # subway info
                    price=item_doc('.price span.num').text() + item_doc('.price span.yue').text(),  # price
                )

                self.save_info(item_dict)

        @staticmethod
        def _parse_max_page(text):
            """Return the page count in ``text``, or 1 when it is not a number."""
            try:
                return int(text)
            except (TypeError, ValueError):
                return 1

        def _fetch_listing(self, page_number):
            """Download list page ``page_number`` via a random proxy and return it parsed."""
            response = self.request_s.get(self.start_url.format(page_number), headers=self.headers,
                                          proxies=random.choice(self.proxy_list),
                                          timeout=self.REQUEST_TIMEOUT)
            return pq(response.content.decode())

        def get_page(self):
            """Scrape page 1, read the last page number from it, then scrape the rest."""
            first_page = self._fetch_listing(1)
            self.get_lists_page_info(list_div=first_page(self._ITEM_SELECTOR).items())

            max_pages_number = first_page('.f-page .pageBox .pageBox a').eq(-2).text()  # last page number
            print(max_pages_number)
            # The pagination text is empty or non-numeric when the pager is not
            # in the response; fall back to the single page already scraped
            # instead of crashing in int().
            last_page = self._parse_max_page(max_pages_number)

            for page_number in range(2, last_page + 1):
                time.sleep(1)
                page_url = self.start_url.format(page_number)
                logging.info('{}'.format(page_url))
                print(page_url)
                page = self._fetch_listing(page_number)
                self.get_lists_page_info(list_div=page(self._ITEM_SELECTOR).items())

        def save_info(self, item_dict):
            """Append ``item_dict`` to the result file as indented JSON followed by ``,``.

            NOTE: the file therefore holds comma-terminated objects, not one JSON document.
            """
            with open(self.result_file, 'a+', encoding='utf8') as f:
                f.write(json.dumps(item_dict, ensure_ascii=False, indent=2) + ',' + '\n')

        def read_good_ip_file(self):
            """Load the validated proxies from ``self.proxy_ip_file``."""
            with open(self.proxy_ip_file, 'r', encoding='utf8') as f:
                self.proxy_list = json.load(fp=f)

        def main(self):
            """Load the proxies, make sure the result directory exists, then crawl."""
            self.read_good_ip_file()
            # open(..., 'a+') does not create missing parent directories.
            result_dir = os.path.dirname(self.result_file)
            if result_dir:
                os.makedirs(result_dir, exist_ok=True)
            self.get_page()
    
    
    if __name__ == '__main__':
        # Alternative listing templates kept for reference:
        #   'http://gz.ganji.com/zufang/pn{}/'  - rentals
        #   'http://gz.ganji.com/hezu/pn{}/'    - shared rentals
        listing_template = 'http://gz.ganji.com/ershoufang/pn{}/'  # second-hand housing
        proxies_json = os.path.join(_path, 'proxy_ip/good_ip.json')
        output_json = os.path.join(_path, 'result/gz_ershoufang_ganji_info.json')

        crawler = GANJI(
            start_url=listing_template,
            proxy_ip_file=proxies_json,
            result_file=output_json,
        )
        crawler.main()

[20191124104042620.png]: https://img-blog.csdnimg.cn/20191124104042620.png