Implementing a crawler in Java to scrape JD (京东) mobile-phone listing pages


This crawler uses HttpClient to fetch the page data and Jsoup to parse it. The analysis of the page itself is not covered in much detail here; readers can work through it on their own.

1. Search for 手机 (mobile phone) on the home page and inspect the URL parameters, then move the "&page=" parameter to the end of the URL so the page number can simply be appended. Note that JD's page parameter only takes odd values (page=1, 3, 5, ... for the first, second, third results page), which is why the crawler loop below steps by 2.

2. View the page source and locate the product-list markup. Extraction then takes four steps, each keyed off the previous one:

    Elements spuEles = doc.select("div#J_goodsList>ul>li");                  // step 1: the product list
    long spu = Long.parseLong(spuEle.attr("data-spu"));                      // step 2: the spu
    Elements skuEles = spuEle.select("li.ps-item");                          // step 3: the sku list
    long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku")); // step 4: one sku

Everything from this point on is driven by the sku.
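The selector chain can be sanity-checked offline, before any Spring wiring, by running it against a small hand-written stand-in for JD's list markup. A minimal sketch (the HTML string and the class name SelectorDemo are illustrative, not taken from the real page):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    public class SelectorDemo {
        public static void main(String[] args) {
            // A tiny stub of the JD list structure, so the selectors can be tried without a network call
            String html = "<div id='J_goodsList'><ul>"
                    + "<li data-spu='100'><ul><li class='ps-item'>"
                    + "<a data-sku='101'><img data-sku='101'></a>"
                    + "</li></ul></li></ul></div>";
            Document doc = Jsoup.parse(html);
            for (Element spuEle : doc.select("div#J_goodsList>ul>li")) {
                long spu = Long.parseLong(spuEle.attr("data-spu"));
                for (Element skuEle : spuEle.select("li.ps-item")) {
                    // Elements.attr returns the attribute of the first matching element
                    long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
                    System.out.println("spu=" + spu + ", sku=" + sku);
                }
            }
        }
    }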

3. The code:

(1) Add the dependencies

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.4</version>
    </dependency>
    <!-- jpa -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>

The database and other dependencies are not shown here. Note also that the Item entity further down uses Lombok's @Data, which is not in this list either; see the dependency snippet after the entity.

(2) A utility class wrapping HttpClient

    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.util.EntityUtils;
    import org.springframework.stereotype.Component;

    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.util.UUID;

    @Component
    public class HttpUtils {

        private PoolingHttpClientConnectionManager cm;

        public HttpUtils() {
            this.cm = new PoolingHttpClientConnectionManager();
            // Maximum total number of pooled connections
            this.cm.setMaxTotal(100);
            // Maximum number of connections per route (host)
            this.cm.setDefaultMaxPerRoute(10);
        }

        // Download the page at the given URL and return it as a string
        public String doGetHtml(String url) {
            // Build an HttpClient backed by the connection pool
            CloseableHttpClient httpClient = HttpClients.custom()
                    .setConnectionManager(this.cm)
                    .build();
            // Create the GET request for the URL
            HttpGet httpGet = new HttpGet(url);
            // Without a User-Agent header, JD redirects every request to the login page
            httpGet.addHeader("user-agent", "Mozilla/5.0");
            // Apply the request configuration (timeouts)
            httpGet.setConfig(this.getConfig());
            CloseableHttpResponse response = null;
            try {
                // Execute the request and obtain the response
                response = httpClient.execute(httpGet);
                // Parse the response and return the result
                if (response.getStatusLine().getStatusCode() == 200) {
                    // Only use EntityUtils if the response entity is non-null
                    if (response.getEntity() != null) {
                        return EntityUtils.toString(response.getEntity(), "utf-8");
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    // Close the response
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Return an empty string on failure
            return "";
        }

        /**
         * Download an image and return the saved file name.
         *
         * @param url the image URL
         * @return the generated file name, or "" on failure
         */
        public String doGetImage(String url) {
            CloseableHttpClient httpClient = HttpClients.custom()
                    .setConnectionManager(this.cm)
                    .build();
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader("user-agent", "Mozilla/5.0");
            httpGet.setConfig(this.getConfig());
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() == 200) {
                    if (response.getEntity() != null) {
                        // Take the file extension from the URL
                        String extName = url.substring(url.lastIndexOf("."));
                        // Rename the image with a random UUID to avoid collisions
                        String picName = UUID.randomUUID().toString() + extName;
                        // Write the image to disk; try-with-resources closes the stream
                        try (OutputStream outputStream =
                                     new FileOutputStream(new File("D:\\driver\\images\\" + picName))) {
                            response.getEntity().writeTo(outputStream);
                        }
                        // Return the image file name
                        return picName;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Return an empty string if the download failed
            return "";
        }

        // Request configuration: timeouts
        private RequestConfig getConfig() {
            return RequestConfig.custom()
                    .setConnectTimeout(1000)          // max time to establish a connection
                    .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool
                    .setSocketTimeout(10000)          // max time for data transfer
                    .build();
        }
    }
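A quick way to smoke-test the utility before scheduling anything is to call it directly from a main method. A minimal sketch (HttpUtilsDemo is a hypothetical class name; the search URL is a shortened form of the one used by the task below):

    public class HttpUtilsDemo {
        public static void main(String[] args) {
            // HttpUtils has a no-arg constructor, so it also works outside the Spring context
            HttpUtils httpUtils = new HttpUtils();
            String html = httpUtils.doGetHtml("https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&page=1");
            System.out.println(html.isEmpty() ? "request failed" : "fetched " + html.length() + " chars");
        }
    }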

(3) DAO layer, service layer, and the entity class

    public interface ItemDao extends JpaRepository<Item, Long> {
    }

    @Service
    public class ItemService {

        @Autowired
        private ItemDao itemDao;

        @Transactional
        public void save(Item item) {
            this.itemDao.save(item);
        }

        public List<Item> findAll(Item item) {
            // Build a query-by-example probe from the populated fields
            Example<Item> example = Example.of(item);
            // Query using the example as the filter
            return this.itemDao.findAll(example);
        }
    }

    @Entity
    @Table(name = "jd_item")
    @Data
    public class Item {

        @Id
        @GeneratedValue(strategy = GenerationType.IDENTITY)
        private Long id;
        private Long spu;
        private Long sku;
        private String title;
        private Double price;
        private String pic;
        private String url;
        private Date created;
        private Date updated;
    }
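One dependency note: the @Data annotation on Item comes from Lombok, which does not appear in the dependency list shown earlier. If it is not already inherited from a parent pom, the standard coordinates are:

    <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <scope>provided</scope>
    </dependency>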

(4) The scheduled task

    import com.fasterxml.jackson.databind.ObjectMapper;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.scheduling.annotation.Scheduled;
    import org.springframework.stereotype.Component;

    import java.util.Date;
    import java.util.List;

    @Component
    public class ItemTask {

        @Autowired
        private HttpUtils httpUtils;

        @Autowired
        private ItemService itemService;

        // Jackson mapper for parsing the price JSON. Note it must NOT be @Autowired:
        // it is created here, not injected from the Spring context
        private static final ObjectMapper MAPPER = new ObjectMapper();

        // fixedDelay: how long to wait after one run completes before starting the next (100 s)
        @Scheduled(fixedDelay = 100 * 1000)
        public void itemTask() throws Exception {
            // Initial search URL, with "&page=" moved to the end for easy concatenation
            String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V13--12%2C20%2C&wq=shouji&cid2=653&cid3=655&s=59&click=0&page=";
            // Iterate over the pages; JD's page parameter uses odd values, so step by 2
            for (int i = 1; i < 10; i = i + 2) {
                String html = httpUtils.doGetHtml(url + i);
                // Parse the page, extract the product data, and store it
                this.parse(html);
            }
            System.out.println("Phone data crawl finished!");
        }

        // Parse the page, extract the product data, and store it
        private void parse(String html) throws Exception {
            // Parse the HTML into a DOM
            Document doc = Jsoup.parse(html);
            // Get the product list
            Elements spuEles = doc.select("div#J_goodsList>ul>li");
            for (Element spuEle : spuEles) {
                // Get the spu
                long spu = Long.parseLong(spuEle.attr("data-spu"));
                // Get the sku list
                Elements skuEles = spuEle.select("li.ps-item");
                for (Element skuEle : skuEles) {
                    // Get the sku
                    long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
                    // Check whether this sku is already in the database
                    Item item = new Item();
                    item.setSku(sku);
                    List<Item> list = this.itemService.findAll(item);
                    if (list.size() > 0) {
                        // Already stored; skip to the next sku
                        continue;
                    }
                    // Set the product's spu
                    item.setSpu(spu);
                    // Build the product detail URL from the sku
                    String itemUrl = "https://item.jd.com/" + sku + ".html";
                    item.setUrl(itemUrl);
                    // Get the product image; the list page lazy-loads it via data-lazy-img
                    String picUrl = "https:" + skuEle.select("img[data-sku]")
                            .first()
                            .attr("data-lazy-img");
                    // Swap the thumbnail size segment for a larger image
                    picUrl = picUrl.replace("/n9", "/n1");
                    String picName = this.httpUtils.doGetImage(picUrl);
                    item.setPic(picName);
                    // Get the price from JD's price API; the response is a JSON array
                    // whose first element carries the price in its "p" field
                    String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                    double price = MAPPER.readTree(priceJson)
                            .get(0)
                            .get("p")
                            .asDouble();
                    item.setPrice(price);
                    // Get the product title from the detail page
                    String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                    String title = Jsoup.parse(itemInfo)
                            .select("div.sku-name")
                            .text();
                    item.setTitle(title);
                    item.setCreated(new Date());
                    item.setUpdated(item.getCreated());
                    // Save the product to the database
                    this.itemService.save(item);
                }
            }
        }
    }
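@Scheduled only fires if scheduling is enabled on the Spring Boot application class, which the post does not show. A minimal sketch (CrawlerApplication is a hypothetical name):

    import org.springframework.boot.SpringApplication;
    import org.springframework.boot.autoconfigure.SpringBootApplication;
    import org.springframework.scheduling.annotation.EnableScheduling;

    @SpringBootApplication
    @EnableScheduling
    public class CrawlerApplication {
        public static void main(String[] args) {
            SpringApplication.run(CrawlerApplication.class, args);
        }
    }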

One caveat from experience: at first I did not add any request header to HttpClient, so every page I fetched was a redirect to the login page. Once the request imitated a browser (the User-Agent header above), everything worked.
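If "Mozilla/5.0" alone ever stops being enough, a fuller browser-like header set can be attached the same way. A sketch (the header values below are example values, not ones verified against JD's checks):

    import org.apache.http.client.methods.HttpGet;

    public class BrowserHeaders {
        // Returns a GET request decorated with browser-like headers
        public static HttpGet browserGet(String url) {
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader("user-agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0 Safari/537.36");
            httpGet.addHeader("accept", "text/html,application/xhtml+xml");
            httpGet.addHeader("accept-language", "zh-CN,zh;q=0.9");
            return httpGet;
        }
    }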
