Python爬虫——爬取房天下

柔情只为你懂 2021-07-27 00:39 953阅读 0赞

Python爬虫——爬取房天下

话不多说,直接上代码!

  1. import requests as req
  2. import time
  3. import pandas as pd
  4. from bs4 import BeautifulSoup
  5. from sqlalchemy import create_engine
  6. global info
  def getHouseInfo(url, timeout=10):
      """Scrape one listing detail page and return its fields as a dict.

      Args:
          url: Absolute URL of the listing detail page.
          timeout: Seconds to wait for the HTTP response before giving up.

      Returns:
          A dict mapping each field label found under
          ``.tab-cont-right .trl-item1`` to its value, plus the keys
          ``"小区名字"`` (community name) and ``"总价"`` (total price).

      Raises:
          requests.RequestException: On network failure, timeout or a
              non-2xx HTTP status.
          IndexError: If the community-name or total-price element is
              missing from the page.
      """
      response = req.get(url, timeout=timeout)
      response.raise_for_status()
      soup = BeautifulSoup(response.text, "html.parser")

      info = {}
      # Collect layout, floor area, unit price, orientation, floor and
      # decoration status. Each item renders as "<value>\n<label>".
      for item in soup.select(".tab-cont-right .trl-item1"):
          # Drop blank lines so extra whitespace between the value and the
          # label cannot shift the label away from index 1.
          parts = [line.strip() for line in item.text.splitlines() if line.strip()]
          if len(parts) < 2:
              # No label to key the value by; skip this item only.
              continue
          value, name = parts[0], parts[1]

          if "朝向" in name:
              # Remove the literal "进门" prefix (str.strip would treat the
              # argument as a set of characters, not as a prefix).
              if name.startswith("进门"):
                  name = name[len("进门"):]
          if "楼层" in name:
              name = name[0:2]
          if "地上层数" in name:
              name = "楼层"
          if "装修程度" in name:
              name = "装修"
          info[name] = value

      info["小区名字"] = soup.select(".rcont .blue")[0].text.strip()
      info["总价"] = soup.select(".tab-cont-right .trl-item")[0].text.strip()
      return info
  # Base URL of the second-hand housing site for the target city.
  domain = "http://esf.anyang.fang.com/"
  # Path segment of the listing index; page URLs are domain + city + "i3<N>".
  city = "house/"
  # Get the total number of result pages.
  def getTotalPage(timeout=10):
      """Return the number of listing pages, as a string of digits.

      Fetches the first results page and reads the page number out of the
      last pagination link's ``href``.

      Args:
          timeout: Seconds to wait for the HTTP response before giving up.

      Returns:
          The page count as a ``str`` (callers convert it with ``int``).

      Raises:
          requests.RequestException: On network failure, timeout or a
              non-2xx HTTP status.
          IndexError: If the page has no pagination links, or the last
              link's href has fewer than two path segments.
      """
      res = req.get(domain + city + "i31", timeout=timeout)
      res.raise_for_status()
      soup = BeautifulSoup(res.text, "html.parser")

      links = soup.select(".page_al a")
      if not links:
          raise IndexError("no pagination links found under '.page_al a'")
      endPage = links[-1]["href"]

      segment = endPage.strip("/").split("/")[1]
      # Remove only the literal "i3" prefix. str.strip("i3") strips the
      # *characters* 'i' and '3' from both ends, which corrupts page counts
      # such as 13 ("i313" -> "1"), 30 ("i330" -> "0") or 3 ("i33" -> "").
      pageNum = segment[2:] if segment.startswith("i3") else segment

      print("loading.....总共 " + pageNum + " 页数据.....")
      return pageNum
  # Scrape the listings of a single results page.
  def pageFun(i, timeout=10):
      """Scrape every listing on results page ``i`` into a DataFrame.

      Args:
          i: Page number (``str`` or ``int``).
          timeout: Seconds to wait for the results-page HTTP response.

      Returns:
          A ``pandas.DataFrame`` with one row per successfully scraped
          listing; empty if none could be scraped.

      Raises:
          requests.RequestException: If the results page itself cannot be
              fetched. Failures on individual listings are printed and
              skipped instead.
      """
      from urllib.parse import urljoin

      page = str(i)
      pageUrl = domain + city + "i3" + page
      print(pageUrl + " loading...第 " + page + " 页数据.....")

      res = req.get(pageUrl, timeout=timeout)
      res.raise_for_status()
      soup = BeautifulSoup(res.text, "html.parser")

      pageInfoList = []
      for house in soup.select(".shop_list dl"):
          try:
              # urljoin avoids the "//" that plain concatenation produces for
              # root-relative hrefs and also accepts absolute hrefs.
              detailUrl = urljoin(domain, house.select("a")[0]["href"])
              info = getHouseInfo(detailUrl)
              pageInfoList.append(info)
              print(info)
          except Exception as e:
              # Best effort: one broken listing must not abort the page.
              print("---->出现异常,跳过 继续执行", e)

      return pd.DataFrame(pageInfoList)
  # NOTE(review): database credentials are hard-coded in the connection URL;
  # consider loading them from configuration or the environment.
  connect = create_engine("mysql+pymysql://root:root@localhost:3306/houseinfo?charset=utf8")

  # Scrape every results page and append its rows to the MySQL table.
  for i in range(1, int(getTotalPage()) + 1):
      try:
          df_onePage = pageFun(str(i))
      except Exception as e:
          print("Exception", e)
          # Skip the write: otherwise the previous page's frame would be
          # inserted again (or NameError raised if the first page failed).
          continue
      if df_onePage.empty:
          # Nothing scraped on this page; there is nothing to insert.
          continue
      df_onePage.to_sql("city_house_price", connect, schema="houseinfo", if_exists="append")

运行结果:
（此处原有两张运行结果截图，图片未能保留。）

发表评论

表情:
评论列表 (有 0 条评论,953人围观)

还没有评论,来说两句吧...

相关阅读