java实现爬虫爬取京东手机页面
本文用java实现爬虫:先利用httpclient获取页面数据,再用jsoup解析出所需的数据。页面结构的分析在此不作过多赘述,请读者自行分析。
1.在首页搜索"手机",观察搜索结果页的url参数,将其中的"&page="移到url的最后,便于拼接页码参数;
2.查看网页源代码,可以看到商品列表标签。所以第一步,获取商品列表,Elements spuEles = doc.select( "div#J_goodsList>ul>li" );第二步,获取spu,long spu = Long.parseLong( spuEle.attr( "data-spu" ) );第三步,获取sku列表,Elements skuEles = spuEle.select( "li.ps-item" );第四步,获取单个sku,long sku = Long.parseLong( skuEle.select( "[data-sku]" ).attr( "data-sku" ) );之后的操作就都是通过这个sku来进行的;
3.代码:
(1)添加依赖
<!-- Apache HttpClient: provides the HttpClients / HttpGet / CloseableHttpResponse classes used by HttpUtils -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- NOTE(review): no <version> element here; presumably the version is managed by a parent POM or a dependencyManagement section. Confirm. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<!-- jsoup: HTML parser used by ItemTask (Jsoup.parse, Document, Elements) -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Apache Commons Lang 3: not referenced by the code shown in this article -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
<!-- Spring Data JPA starter: backs the ItemDao repository (JpaRepository) -->
<!-- jpa -->
<!-- NOTE(review): no <version> element here either; presumably managed the same way as httpclient. Confirm. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
数据库等其他依赖就不贴了;
(2)工具类封装
/**
 * HTTP helper built on Apache HttpClient.
 *
 * <p>Offers two operations: download a page as text ({@link #doGetHtml(String)})
 * and download an image to the local image directory ({@link #doGetImage(String)}).
 * Every request is issued through one pooled connection manager, carries the
 * same {@code user-agent} header and uses the timeouts from {@link #getConfig()}.
 *
 * <p>Both operations are best-effort: any failure is printed to the standard
 * error stream and reported to the caller as an empty string.
 */
@Component
public class HttpUtils {
    // Directory that downloaded images are written to.
    private static final String IMAGE_DIR = "D:\\driver\\images\\";

    // Connection pool shared by every client built in this class.
    private PoolingHttpClientConnectionManager cm;

    /**
     * Creates the shared connection pool.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool
        this.cm.setMaxTotal( 100 );
        // Maximum number of connections per host (route)
        this.cm.setDefaultMaxPerRoute( 10 );
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address to request with HTTP GET
     * @return the response body decoded as UTF-8 when the status code is 200 and
     *         a body is present; otherwise (including on any exception) an empty string
     */
    public String doGetHtml( String url ) {
        CloseableHttpClient httpClient = this.newClient();
        HttpGet httpGet = this.newGet( url );
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response
            response = httpClient.execute( httpGet );
            // EntityUtils may only be used when the response actually has a body
            if ( response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null ) {
                return EntityUtils.toString( response.getEntity(), "utf-8" );
            }
        } catch( Exception e ) {
            e.printStackTrace();
        } finally {
            this.closeResponse( response );
        }
        // Nothing could be downloaded
        return "";
    }

    /**
     * Downloads the image at the given address into the image directory.
     *
     * <p>The file is stored under a random UUID name that keeps the suffix of
     * {@code url} starting at its last {@code '.'}.
     *
     * @param url address of the image to request with HTTP GET
     * @return the generated file name (without directory) when the image was
     *         written completely; otherwise (including on any exception) an empty string
     */
    public String doGetImage( String url ) {
        CloseableHttpClient httpClient = this.newClient();
        HttpGet httpGet = this.newGet( url );
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response
            response = httpClient.execute( httpGet );
            if ( response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null ) {
                // Keep the original file suffix, e.g. ".jpg"
                String extName = url.substring( url.lastIndexOf( "." ) );
                // Rename the image to a unique name
                String picName = UUID.randomUUID().toString() + extName;
                // try-with-resources guarantees the file handle is released even
                // when writing fails; the name is only returned after a clean close
                try ( OutputStream outPutStream = new FileOutputStream( new File( IMAGE_DIR + picName ) ) ) {
                    response.getEntity().writeTo( outPutStream );
                }
                return picName;
            }
        } catch( Exception e ) {
            e.printStackTrace();
        } finally {
            this.closeResponse( response );
        }
        // The download failed
        return "";
    }

    // Builds a client that borrows its connections from the shared pool.
    private CloseableHttpClient newClient() {
        return HttpClients.custom()
                          .setConnectionManager( this.cm )
                          .build();
    }

    // Builds the GET request with the common user-agent header and timeouts.
    private HttpGet newGet( String url ) {
        HttpGet httpGet = new HttpGet( url );
        httpGet.addHeader( "user-agent", "Mozilla/5.0" );
        httpGet.setConfig( this.getConfig() );
        return httpGet;
    }

    // Closes the response if there is one; a failure to close is only printed.
    private void closeResponse( CloseableHttpResponse response ) {
        if ( response == null ) {
            return;
        }
        try {
            response.close();
        } catch( IOException e ) {
            e.printStackTrace();
        }
    }

    // Builds the request configuration (timeouts in milliseconds).
    private RequestConfig getConfig() {
        RequestConfig config = RequestConfig.custom()
                                            .setConnectTimeout( 1000 ) // max time to establish a connection
                                            .setConnectionRequestTimeout( 500 ) // max time to obtain a connection from the pool
                                            .setSocketTimeout( 10000 ) // max time for data transfer
                                            .build();
        return config;
    }
}
(3)dao层,service层以及实体类
/**
 * Repository for {@link Item} entities with a {@code Long} primary key.
 * Declares no methods of its own; everything comes from {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}
/**
 * Service layer for {@link Item}: thin wrapper around {@link ItemDao}.
 */
@Service
public class ItemService {
    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the given item inside a transaction.
     *
     * @param item the item to store
     */
    @Transactional
    public void save( Item item ) {
        itemDao.save( item );
    }

    /**
     * Finds stored items using the given item as a query-by-example probe.
     *
     * @param item the probe passed to {@code Example.of}
     * @return the items returned by the repository for that example
     */
    public List<Item> findAll( Item item ) {
        return itemDao.findAll( Example.of( item ) );
    }
}
/**
 * JPA entity mapped to the {@code jd_item} table; one row per crawled SKU.
 * NOTE(review): accessors are presumably generated by Lombok's {@code @Data}; confirm.
 */
@Entity
@Table( name = "jd_item" )
@Data
public class Item {
// Primary key, generated with the IDENTITY strategy
@Id
@GeneratedValue( strategy = GenerationType.IDENTITY )
private Long id;
// Value of the listing element's data-spu attribute
private Long spu;
// Value of the data-sku attribute; the crawler looks items up by this field
private Long sku;
// Text of the div.sku-name element on the item's detail page
private String title;
// Price read from the "p" field of the price service response
private Double price;
// File name returned by the image download (empty string when it failed)
private String pic;
// Address of the item's detail page
private String url;
// Time the record was created by the crawler
private Date created;
// Time of the last update; set equal to created when the record is first saved
private Date updated;
}
(4)定时任务
/**
 * Scheduled crawler for the JD.com phone search results.
 *
 * <p>Each run downloads a fixed set of result pages, extracts every SKU on
 * them and stores the SKUs that are not in the database yet, together with
 * their detail-page address, image file name, price and title.
 *
 * <p>Entries whose markup or price cannot be read are skipped (with a message
 * on the standard error stream) instead of aborting the whole run; because
 * they are not saved, a later run will try them again.
 */
@Component
public class ItemTask {
    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;
    // Shared JSON parser, initialised inline. It must not carry @Autowired:
    // Spring does not inject static fields, so the annotation would have no effect.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls the result pages and stores the new items.
     * The next run starts 100 seconds after the previous one has finished.
     *
     * @throws Exception if parsing or saving an item fails unexpectedly
     */
    @Scheduled( fixedDelay = 100 * 1000 )
    public void itemTask() throws Exception {
        // Search address; the page number is appended to it
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V13--12%2C20%2C&wq=shouji&cid2=653&cid3=655&s=59&click=0&page=";
        // Walk through the pages 1, 3, 5, 7 and 9
        for( int i = 1; i < 10; i = i + 2 ) {
            String html = httpUtils.doGetHtml( url + i );
            if ( html == null || html.isEmpty() ) {
                // The download failed; nothing to parse for this page
                System.err.println( "Skipping result page " + i + ": no content downloaded" );
                continue;
            }
            // Parse the page, extract the items and store them
            this.parse( html );
        }
        System.out.println( "手机数据抓取完成!" );
    }

    /**
     * Parses one result page and stores every SKU that is not stored yet.
     *
     * @param html markup of the result page
     * @throws Exception if looking up or saving an item fails
     */
    private void parse( String html ) throws Exception {
        Document doc = Jsoup.parse( html );
        // Product list
        Elements spuEles = doc.select( "div#J_goodsList>ul>li" );
        for( Element spuEle : spuEles ) {
            Long spu = parseId( spuEle.attr( "data-spu" ) );
            if ( spu == null ) {
                System.err.println( "Skipping list entry without a numeric data-spu" );
                continue;
            }
            // SKU list of this product
            Elements skuEles = spuEle.select( "li.ps-item" );
            for( Element skuEle : skuEles ) {
                Long sku = parseId( skuEle.select( "[data-sku]" )
                                          .attr( "data-sku" ) );
                if ( sku == null ) {
                    System.err.println( "Skipping entry without a numeric data-sku (spu " + spu + ")" );
                    continue;
                }
                // Look the item up by its sku
                Item item = new Item();
                item.setSku( sku );
                List<Item> list = this.itemService.findAll( item );
                if ( !list.isEmpty() ) {
                    // Already stored: do not save it again
                    continue;
                }
                // Fetch the price before downloading anything to disk, so a
                // failed lookup does not leave an orphaned image file behind
                Double price = this.fetchPrice( sku );
                if ( price == null ) {
                    System.err.println( "Skipping sku " + sku + ": price not available" );
                    continue;
                }
                item.setPrice( price );
                item.setSpu( spu );
                // Address of the detail page
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl( itemUrl );
                // Image of the item
                item.setPic( this.downloadImage( skuEle ) );
                // Title of the item, taken from the detail page
                String itemInfo = this.httpUtils.doGetHtml( item.getUrl() );
                String title = Jsoup.parse( itemInfo )
                                    .select( "div.sku-name" )
                                    .text();
                item.setTitle( title );
                item.setCreated( new Date() );
                item.setUpdated( item.getCreated() );
                // Store the item
                this.itemService.save( item );
            }
        }
    }

    // Parses a numeric id attribute; returns null when it is missing or not a number.
    private static Long parseId( String raw ) {
        if ( raw == null ) {
            return null;
        }
        String trimmed = raw.trim();
        if ( trimmed.isEmpty() ) {
            return null;
        }
        try {
            return Long.valueOf( trimmed );
        } catch( NumberFormatException e ) {
            return null;
        }
    }

    // Downloads the image referenced by the sku element; returns the stored file
    // name, or an empty string when there is no image or the download failed.
    private String downloadImage( Element skuEle ) {
        Element img = skuEle.select( "img[data-sku]" )
                            .first();
        if ( img == null ) {
            return "";
        }
        String lazyImg = img.attr( "data-lazy-img" );
        if ( lazyImg.isEmpty() ) {
            return "";
        }
        String picUrl = ( "https:" + lazyImg ).replace( "/n9", "/n1" );
        return this.httpUtils.doGetImage( picUrl );
    }

    // Reads the "p" field of the first element of the price service response;
    // returns null when the response is empty, malformed or has no usable price.
    private Double fetchPrice( long sku ) {
        String priceJson = this.httpUtils.doGetHtml( "https://p.3.cn/prices/mgets?skuIds=J_" + sku );
        if ( priceJson == null || priceJson.trim().isEmpty() ) {
            return null;
        }
        try {
            // path(..) yields a "missing" node instead of null for absent
            // members, whose text form is the empty string
            String rawPrice = MAPPER.readTree( priceJson )
                                    .path( 0 )
                                    .path( "p" )
                                    .asText();
            if ( rawPrice == null || rawPrice.isEmpty() ) {
                return null;
            }
            return Double.valueOf( rawPrice );
        } catch( java.io.IOException | NumberFormatException e ) {
            // Not JSON (for example an error page) or not a number
            return null;
        }
    }
}
注意事项:一开始我的httpclient请求没有添加user-agent请求头,导致获取到的html都是跳转后的登录页面;添加该请求头模拟浏览器之后就正常了。
还没有评论,来说两句吧...