Implementing a crawler in Java to scrape JD (京东) mobile-phone listing pages


This crawler uses HttpClient to fetch the page data and Jsoup to parse it. The analysis of the page itself is not covered in much detail here; readers can work through it on their own.

1. Search for 手机 (mobile phone) on the home page and inspect the URL parameters, then move the "&page=" parameter to the end of the URL so the page number can simply be appended. Note that JD's page parameter only takes odd values (page=1, 3, 5, ... for the first, second, third results page), which is why the crawler loop below steps by 2.

2. View the page source and locate the product-list markup. Extraction then takes four steps, each keyed off the previous one:

    Elements spuEles = doc.select("div#J_goodsList>ul>li");                  // step 1: the product list
    long spu = Long.parseLong(spuEle.attr("data-spu"));                      // step 2: the spu
    Elements skuEles = spuEle.select("li.ps-item");                          // step 3: the sku list
    long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku")); // step 4: one sku

Everything from this point on is driven by the sku.
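The selector chain can be sanity-checked offline, before any Spring wiring, by running it against a small hand-written stand-in for JD's list markup. A minimal sketch (the HTML string and the class name SelectorDemo are illustrative, not taken from the real page):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    public class SelectorDemo {
        public static void main(String[] args) {
            // A tiny stub of the JD list structure, so the selectors can be tried without a network call
            String html = "<div id='J_goodsList'><ul>"
                    + "<li data-spu='100'><ul><li class='ps-item'>"
                    + "<a data-sku='101'><img data-sku='101'></a>"
                    + "</li></ul></li></ul></div>";
            Document doc = Jsoup.parse(html);
            for (Element spuEle : doc.select("div#J_goodsList>ul>li")) {
                long spu = Long.parseLong(spuEle.attr("data-spu"));
                for (Element skuEle : spuEle.select("li.ps-item")) {
                    // Elements.attr returns the attribute of the first matching element
                    long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
                    System.out.println("spu=" + spu + ", sku=" + sku);
                }
            }
        }
    }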

3. The code:

(1) Add the dependencies

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.4</version>
    </dependency>
    <!-- jpa -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>

The database and other dependencies are not shown here. Note also that the Item entity further down uses Lombok's @Data, which is not in this list either; see the dependency snippet after the entity.

(2) A utility class wrapping HttpClient

    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.util.EntityUtils;
    import org.springframework.stereotype.Component;

    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.util.UUID;

    @Component
    public class HttpUtils {

        private PoolingHttpClientConnectionManager cm;

        public HttpUtils() {
            this.cm = new PoolingHttpClientConnectionManager();
            // Maximum total number of pooled connections
            this.cm.setMaxTotal(100);
            // Maximum number of connections per route (host)
            this.cm.setDefaultMaxPerRoute(10);
        }

        // Download the page at the given URL and return it as a string
        public String doGetHtml(String url) {
            // Build an HttpClient backed by the connection pool
            CloseableHttpClient httpClient = HttpClients.custom()
                    .setConnectionManager(this.cm)
                    .build();
            // Create the GET request for the URL
            HttpGet httpGet = new HttpGet(url);
            // Without a User-Agent header, JD redirects every request to the login page
            httpGet.addHeader("user-agent", "Mozilla/5.0");
            // Apply the request configuration (timeouts)
            httpGet.setConfig(this.getConfig());
            CloseableHttpResponse response = null;
            try {
                // Execute the request and obtain the response
                response = httpClient.execute(httpGet);
                // Parse the response and return the result
                if (response.getStatusLine().getStatusCode() == 200) {
                    // Only use EntityUtils if the response entity is non-null
                    if (response.getEntity() != null) {
                        return EntityUtils.toString(response.getEntity(), "utf-8");
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    // Close the response
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Return an empty string on failure
            return "";
        }

        /**
         * Download an image and return the saved file name.
         *
         * @param url the image URL
         * @return the generated file name, or "" on failure
         */
        public String doGetImage(String url) {
            CloseableHttpClient httpClient = HttpClients.custom()
                    .setConnectionManager(this.cm)
                    .build();
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader("user-agent", "Mozilla/5.0");
            httpGet.setConfig(this.getConfig());
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() == 200) {
                    if (response.getEntity() != null) {
                        // Take the file extension from the URL
                        String extName = url.substring(url.lastIndexOf("."));
                        // Rename the image with a random UUID to avoid collisions
                        String picName = UUID.randomUUID().toString() + extName;
                        // Write the image to disk; try-with-resources closes the stream
                        try (OutputStream outputStream =
                                     new FileOutputStream(new File("D:\\driver\\images\\" + picName))) {
                            response.getEntity().writeTo(outputStream);
                        }
                        // Return the image file name
                        return picName;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Return an empty string if the download failed
            return "";
        }

        // Request configuration: timeouts
        private RequestConfig getConfig() {
            return RequestConfig.custom()
                    .setConnectTimeout(1000)          // max time to establish a connection
                    .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool
                    .setSocketTimeout(10000)          // max time for data transfer
                    .build();
        }
    }
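A quick way to smoke-test the utility before scheduling anything is to call it directly from a main method. A minimal sketch (HttpUtilsDemo is a hypothetical class name; the search URL is a shortened form of the one used by the task below):

    public class HttpUtilsDemo {
        public static void main(String[] args) {
            // HttpUtils has a no-arg constructor, so it also works outside the Spring context
            HttpUtils httpUtils = new HttpUtils();
            String html = httpUtils.doGetHtml("https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&page=1");
            System.out.println(html.isEmpty() ? "request failed" : "fetched " + html.length() + " chars");
        }
    }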

(3) DAO layer, service layer, and the entity class

    public interface ItemDao extends JpaRepository<Item, Long> {
    }

    @Service
    public class ItemService {

        @Autowired
        private ItemDao itemDao;

        @Transactional
        public void save(Item item) {
            this.itemDao.save(item);
        }

        public List<Item> findAll(Item item) {
            // Build a query-by-example probe from the populated fields
            Example<Item> example = Example.of(item);
            // Query using the example as the filter
            return this.itemDao.findAll(example);
        }
    }

    @Entity
    @Table(name = "jd_item")
    @Data
    public class Item {

        @Id
        @GeneratedValue(strategy = GenerationType.IDENTITY)
        private Long id;
        private Long spu;
        private Long sku;
        private String title;
        private Double price;
        private String pic;
        private String url;
        private Date created;
        private Date updated;
    }
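One dependency note: the @Data annotation on Item comes from Lombok, which does not appear in the dependency list shown earlier. If it is not already inherited from a parent pom, the standard coordinates are:

    <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <scope>provided</scope>
    </dependency>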

(4) The scheduled task

    import com.fasterxml.jackson.databind.ObjectMapper;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.scheduling.annotation.Scheduled;
    import org.springframework.stereotype.Component;

    import java.util.Date;
    import java.util.List;

    @Component
    public class ItemTask {

        @Autowired
        private HttpUtils httpUtils;

        @Autowired
        private ItemService itemService;

        // Jackson mapper for parsing the price JSON. Note it must NOT be @Autowired:
        // it is created here, not injected from the Spring context
        private static final ObjectMapper MAPPER = new ObjectMapper();

        // fixedDelay: how long to wait after one run completes before starting the next (100 s)
        @Scheduled(fixedDelay = 100 * 1000)
        public void itemTask() throws Exception {
            // Initial search URL, with "&page=" moved to the end for easy concatenation
            String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V13--12%2C20%2C&wq=shouji&cid2=653&cid3=655&s=59&click=0&page=";
            // Iterate over the pages; JD's page parameter uses odd values, so step by 2
            for (int i = 1; i < 10; i = i + 2) {
                String html = httpUtils.doGetHtml(url + i);
                // Parse the page, extract the product data, and store it
                this.parse(html);
            }
            System.out.println("Phone data crawl finished!");
        }

        // Parse the page, extract the product data, and store it
        private void parse(String html) throws Exception {
            // Parse the HTML into a DOM
            Document doc = Jsoup.parse(html);
            // Get the product list
            Elements spuEles = doc.select("div#J_goodsList>ul>li");
            for (Element spuEle : spuEles) {
                // Get the spu
                long spu = Long.parseLong(spuEle.attr("data-spu"));
                // Get the sku list
                Elements skuEles = spuEle.select("li.ps-item");
                for (Element skuEle : skuEles) {
                    // Get the sku
                    long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
                    // Check whether this sku is already in the database
                    Item item = new Item();
                    item.setSku(sku);
                    List<Item> list = this.itemService.findAll(item);
                    if (list.size() > 0) {
                        // Already stored; skip to the next sku
                        continue;
                    }
                    // Set the product's spu
                    item.setSpu(spu);
                    // Build the product detail URL from the sku
                    String itemUrl = "https://item.jd.com/" + sku + ".html";
                    item.setUrl(itemUrl);
                    // Get the product image; the list page lazy-loads it via data-lazy-img
                    String picUrl = "https:" + skuEle.select("img[data-sku]")
                            .first()
                            .attr("data-lazy-img");
                    // Swap the thumbnail size segment for a larger image
                    picUrl = picUrl.replace("/n9", "/n1");
                    String picName = this.httpUtils.doGetImage(picUrl);
                    item.setPic(picName);
                    // Get the price from JD's price API; the response is a JSON array
                    // whose first element carries the price in its "p" field
                    String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                    double price = MAPPER.readTree(priceJson)
                            .get(0)
                            .get("p")
                            .asDouble();
                    item.setPrice(price);
                    // Get the product title from the detail page
                    String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                    String title = Jsoup.parse(itemInfo)
                            .select("div.sku-name")
                            .text();
                    item.setTitle(title);
                    item.setCreated(new Date());
                    item.setUpdated(item.getCreated());
                    // Save the product to the database
                    this.itemService.save(item);
                }
            }
        }
    }
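@Scheduled only fires if scheduling is enabled on the Spring Boot application class, which the post does not show. A minimal sketch (CrawlerApplication is a hypothetical name):

    import org.springframework.boot.SpringApplication;
    import org.springframework.boot.autoconfigure.SpringBootApplication;
    import org.springframework.scheduling.annotation.EnableScheduling;

    @SpringBootApplication
    @EnableScheduling
    public class CrawlerApplication {
        public static void main(String[] args) {
            SpringApplication.run(CrawlerApplication.class, args);
        }
    }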

One caveat from experience: at first I did not add any request header to HttpClient, so every page I fetched was a redirect to the login page. Once the request imitated a browser (the User-Agent header above), everything worked.
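If "Mozilla/5.0" alone ever stops being enough, a fuller browser-like header set can be attached the same way. A sketch (the header values below are example values, not ones verified against JD's checks):

    import org.apache.http.client.methods.HttpGet;

    public class BrowserHeaders {
        // Returns a GET request decorated with browser-like headers
        public static HttpGet browserGet(String url) {
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader("user-agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0 Safari/537.36");
            httpGet.addHeader("accept", "text/html,application/xhtml+xml");
            httpGet.addHeader("accept-language", "zh-CN,zh;q=0.9");
            return httpGet;
        }
    }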
