Python 3 [Crawler practice for beginners]: scraping Youtianxia (youtx.com) Nanjing short-term rentals with Scrapy and storing them in MongoDB

素颜马尾好姑娘i · 2021-06-10 20:42

Summary: overall this is not hard, there are just quite a few fields to extract. In total it scraped a bit over 120 Nanjing rental listings.


1 The scraped item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YoutxnanjinItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # pass

    # Listing name
    homeName = scrapy.Field()
    # Listing link
    homeLine = scrapy.Field()
    # Nightly price
    homeSinglePrice = scrapy.Field()
    # Listing address
    homeAddress = scrapy.Field()
    # Recent listing details
    homeDetai = scrapy.Field()
    # Price for stays of 7+ days
    homeSeven = scrapy.Field()
    # Price for stays of 30+ days
    homeThirth = scrapy.Field()
    # Landlord
    homePerson = scrapy.Field()
    # Landlord avatar
    homePersonImg = scrapy.Field()
    # Landlord avatar link
    homePersonLink = scrapy.Field()
    # Main listing photo
    homePicBg = scrapy.Field()
    # Main listing photo link
    homePicLink = scrapy.Field()
    # Brand-store info (not used)
    # homePinPai = scrapy.Field()
    # Star landlord (not used)
    # homeStarrPerson = scrapy.Field()

Let me just ask: aren't the comments detailed enough?
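As a side note, the repeated "".join(...) calls you will see in the spider below could also be handled declaratively with an ItemLoader. The following is only a hedged sketch of that alternative, not something the original project uses; the processors shown are standard Scrapy, and only a couple of fields are wired up for illustration.

# Sketch only (not in the original project): an ItemLoader that strips and
# joins extracted strings, replacing the manual "".join(...) calls in parse().
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join  # newer Scrapy: itemloaders.processors

from youtxNanJin.items import YoutxnanjinItem


class YoutxnanjinLoader(ItemLoader):
    default_item_class = YoutxnanjinItem
    default_input_processor = MapCompose(str.strip)  # strip each extracted string
    default_output_processor = Join('')              # then join the pieces into one value

# Usage inside parse(), per listing node:
#     loader = YoutxnanjinLoader(selector=node)
#     loader.add_xpath('homeName',
#                      "./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()")
#     yield loader.load_item()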

2 The spider code

# encoding=utf8
import scrapy
from youtxNanJin.items import YoutxnanjinItem


class NanJinDefault(scrapy.Spider):
    name = 'youtx'
    allowed_domains = ['youtx.com']
    start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0, 6)]

    def parse(self, response):
        # print(response.body)
        node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']")
        # print(node_list)
        for node in node_list:
            item = YoutxnanjinItem()
            # Listing name and link
            homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract()
            homeLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/@href").extract()
            print(homeName)
            print(homeLink)
            # Nightly price
            homeSinglePrice = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract()
            print(homeSinglePrice)
            # Listing address
            homeAddress = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='clearfix mt5']/text()").extract()
            # Listing details
            homeDesc = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/text()").extract()
            homeDesc2 = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/span[2]/text()").extract()
            print(homeAddress)
            print(homeDesc)
            print(homeDesc2)
            # Price for stays of 30+ days
            homeThrty = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/div[@class='mix12_5']/div[@class='discount']/div[@class='discount-price']/span//text()").extract()
            print(homeThrty)
            # Landlord name
            homePerson = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/text()").extract()
            # Landlord page link
            homePersonLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/@href").extract()
            print(homePerson)
            print(homePersonLink)
            # Main listing photo and its link
            homeBigPic = node.xpath("./div[@class='house-img']/a[1]/img/@src").extract()
            homeBigPicLink = node.xpath("./div[@class='house-img']/a[1]/@href").extract()
            print(homeBigPic)
            print(homeBigPicLink)
            # Landlord avatar
            personPic = node.xpath("./div[@class='house-img']/a[2]/img/@src").extract()
            # Landlord avatar link (taken from the <a>, since <img> has no href);
            # extracted here but not stored in the item yet
            personPicLink = node.xpath("./div[@class='house-img']/a[2]/@href").extract()
            print(personPic)
            print(personPicLink)

            item['homeName'] = "".join(homeName)
            item['homeLine'] = "".join(homeLink)
            item['homeSinglePrice'] = "".join(homeSinglePrice)
            item['homeAddress'] = "".join(homeAddress)
            item['homeDetai'] = "".join(homeDesc) + "".join(homeDesc2)
            # The 7-day price hasn't been extracted yet, so reuse the 30-day value for now
            item['homeSeven'] = "".join(homeThrty)
            item['homeThirth'] = "".join(homeThrty)
            item['homePerson'] = "".join(homePerson)
            item['homePersonImg'] = "".join(personPic)
            item['homePersonLink'] = "".join(homePersonLink)
            item['homePicBg'] = "".join(homeBigPic)
            item['homePicLink'] = "".join(homeBigPicLink)
            yield item

Note: the xpath expressions here are quite long because of how the page is structured. The common prefix could be factored out, but I'm not doing that here for now.
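If you did want to shorten them, one possible approach (purely a sketch, not what the spider above does) is to select the shared houseInfo div once per listing and run the remaining, shorter xpaths relative to it:

# Sketch of factoring out the common prefix; the class names are taken from
# the xpaths above, the rest is illustrative.
for node in node_list:
    info = node.xpath("./div[@class='houseInfo clearfix']")       # shared prefix, selected once
    title = info.xpath("./div[@class='house-tit clearfix']")      # title/price block
    homeName = title.xpath("./h3/a/text()").extract()
    homeLink = title.xpath("./h3/a/@href").extract()
    homeSinglePrice = title.xpath(
        "./div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract()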

3 Next, the item pipelines

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymongo
from scrapy.conf import settings  # deprecated in newer Scrapy versions, see the note below


class YoutxnanjinPipeline(object):
    def process_item(self, item, spider):
        return item


# Store items in MongoDB
class YouTXMongo(object):
    def __init__(self):
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = self.client[settings['MONGO_DB']]
        self.post = self.db[settings['MONGO_COLL']]

    def process_item(self, item, spider):
        postItem = dict(item)
        self.post.insert_one(postItem)  # insert() is deprecated in pymongo
        return item


# Write items to a JSON-lines file
class JsonWritePipline(object):
    def __init__(self):
        self.file = open('游天下南京.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):  # Scrapy calls close_spider on pipelines, not spider_closed
        self.file.close()

Once you've written one way of storing the data, the other storage pipelines are basically copy-and-paste, as long as nothing breaks.
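One caveat: from scrapy.conf import settings is deprecated and has been removed in newer Scrapy releases. A hedged sketch of the currently recommended way to read the same MongoDB settings inside a pipeline looks like this (setting names match the ones defined in settings.py below):

# Sketch only: reading settings through from_crawler instead of scrapy.conf.
import pymongo


class YouTXMongo(object):
    def __init__(self, host, port, db, coll):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.post = self.client[db][coll]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy passes the crawler, whose settings object holds MONGO_* values
        s = crawler.settings
        return cls(s.get('MONGO_HOST'), s.getint('MONGO_PORT'),
                   s.get('MONGO_DB'), s.get('MONGO_COLL'))

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item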

4 The settings code

This is mainly where the MongoDB connection and the User-Agent header are configured.

# -*- coding: utf-8 -*-

# Scrapy settings for youtxNanJin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'youtxNanJin'

SPIDER_MODULES = ['youtxNanJin.spiders']
NEWSPIDER_MODULE = 'youtxNanJin.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'youtxNanJin (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# MongoDB configuration
MONGO_HOST = "127.0.0.1"        # host IP
MONGO_PORT = 27017              # port
MONGO_DB = "YouTianXia"         # database name
MONGO_COLL = "house_nanjin"     # collection

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'youtxNanJin.middlewares.YoutxnanjinSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'youtxNanJin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'youtxNanJin.pipelines.YoutxnanjinPipeline': 300,
    'youtxNanJin.pipelines.YouTXMongo': 300,
    'youtxNanJin.pipelines.JsonWritePipline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
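To run the project and sanity-check what was stored, something along these lines should work; the spider name and the MongoDB settings come from the code above, and the exact document count depends on the site at crawl time:

# Run from the project root:
#     scrapy crawl youtx
# Then verify what landed in MongoDB. Host/port/database/collection names
# match the values configured in settings.py above (pymongo 3.7+ assumed
# for count_documents).
import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
coll = client["YouTianXia"]["house_nanjin"]

print(coll.count_documents({}))       # should be a bit over 120 listings
for doc in coll.find().limit(3):      # spot-check a few stored items
    print(doc["homeName"], doc["homeSinglePrice"])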

To wrap up, here is a screenshot of the scraped results. This was more practice with the same kind of crawler; going forward I want to learn to scrape larger, messier sites that aren't as easy to crawl.

(Screenshot of the scraped data.)
