首页 > 编程语言 > 详细

Java代码---实现爬取腾讯新闻

时间:2020-03-17 15:50:54      阅读:184      评论:0      收藏:0      [点我收藏+]

环境准备(Maven 依赖;注意:下面的爬取代码还用到了 Apache HttpClient,需要另外引入 org.apache.httpcomponents:httpclient):

<!-- Druid: provides the DruidDataSource that the crawler configures from db.properties. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.1.21</version>
</dependency>

<!-- Gson: used to turn the JSON payload of each feed page into a Map. -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>

<!-- Spring JDBC: provides JdbcTemplate, which performs the INSERT for every news item. -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-jdbc</artifactId>
<version>5.2.2.RELEASE</version>
</dependency>

<!-- MySQL JDBC driver; presumably the class named by jdbc.driverClass in db.properties (confirm). -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>

定义 POJO(即下文代码中使用的 Tencent 类)用来接收每条新闻的数据,getter/setter 省略:

// Serialization version marker; presumably the enclosing class implements Serializable (header not shown).
private static final long serialVersionUID = 1L;
// Row identifier. The crawler never sets it and the INSERT omits it, so it is
// presumably generated by the database -- TODO confirm against the table definition.
private int id;
// Headline, filled from the "title" key of a feed item.
private String title;
// Short summary, filled from the "intro" key of a feed item.
private String intro;
// Link to the article, filled from the "url" key of a feed item.
private String url;
// Publisher name, filled from the "source" key of a feed item.
private String source;
// Publication time, parsed from the "publish_time" key using the pattern "yyyy-MM-dd HH:mm:ss".
private Date publishTime;

 

 

 

 代码爬取数据:

/** JDBC helper shared by {@code main} (which initialises it) and {@code addNews}. */
static JdbcTemplate jdbcTemplate = null;

/**
 * Crawls the Tencent news JSONP feed page by page and stores every item through
 * {@code addNews}.
 *
 * <p>Steps: read the database settings from {@code src/main/resources/db.properties},
 * build a Druid-backed {@code JdbcTemplate}, then request page 1, 2, 3, ... of the feed
 * until a page reports {@code datanum == 0} (or carries no usable payload).
 *
 * <p>Pages answered with a non-200 status are skipped, but after three such responses
 * in a row the crawl is aborted instead of looping forever.
 *
 * @param args unused
 * @throws IOException if the properties file cannot be read, an HTTP request fails, or
 *         the feed keeps answering with a non-200 status
 * @throws ParseException if an item's {@code publish_time} does not match
 *         {@code yyyy-MM-dd HH:mm:ss}
 */
public static void main(String[] args) throws IOException, ParseException {
    // Load the external properties file; try-with-resources closes the stream.
    Properties properties = new Properties();
    try (InputStream inputStream = new FileInputStream(new File("src/main/resources/db.properties"))) {
        properties.load(inputStream);
    }

    // Database connection settings.
    String driver = properties.getProperty("jdbc.driverClass");
    String url = properties.getProperty("jdbc.url");
    String username = properties.getProperty("jdbc.username");
    String password = properties.getProperty("jdbc.password");

    // The data source is intentionally left open: jdbcTemplate is a static field and
    // stays usable by addNews after this method returns.
    DruidDataSource dataSource = new DruidDataSource();
    dataSource.setDriverClassName(driver);
    dataSource.setUrl(url);
    dataSource.setUsername(username);
    dataSource.setPassword(password);

    jdbcTemplate = new JdbcTemplate(dataSource);

    // Created once and reused for every page and item; only this thread touches them.
    Gson gson = new Gson();
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // Upper bound on back-to-back non-200 answers before the crawl gives up.
    final int maxConsecutiveFailures = 3;
    int consecutiveFailures = 0;

    // One HTTP client for the whole crawl, closed when the crawl ends.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        // Start with the first page.
        int page = 1;
        while (true) {
            String urlTencent ="https://pacaio.match.qq.com/irs/rcd?cid=135&token=6e92c215fb08afa901ac31eca115a34f&ext=world&page="+page+"&expIds=&callback=__jp4";
            // Alternative feed URL kept for reference:
            //String urlTencent = "https://pacaio.match.qq.com/irs/rcd?cid=89&token=4d4e2946f92c5708f32141479596d72e&id=&ext=bj&page="+page+"&expIds=&callback=__jp0";

            HttpGet httpGet = new HttpGet(urlTencent);
            httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");

            // Closing the response releases the underlying connection, also on break/throw.
            try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
                int statusCode = httpResponse.getStatusLine().getStatusCode();
                if (statusCode == 200) {
                    consecutiveFailures = 0;
                    HttpEntity httpEntity = httpResponse.getEntity();
                    // Fall back to UTF-8 when the response does not declare a charset.
                    String html = EntityUtils.toString(httpEntity, "UTF-8");
                    // Strip the JSONP callback wrapper.
                    String json = parseJson(html);
                    // Gson maps a JSON object to a Map and JSON numbers to Double.
                    Map<?, ?> map = gson.fromJson(json, Map.class);

                    // "datanum" is the number of items on this page; 0 or absent means we are done.
                    Object num = (map == null) ? null : map.get("datanum");
                    if (num == null) {
                        break;
                    }
                    int number = (int) Double.parseDouble(num.toString());
                    if (number == 0) {
                        break;
                    }

                    // The items of the current page.
                    Object data = map.get("data");
                    if (!(data instanceof List)) {
                        break;
                    }
                    for (Object element : (List<?>) data) {
                        Map<?, ?> item = (Map<?, ?>) element;
                        String title = item.get("title").toString();
                        String intro = item.get("intro").toString();
                        String turl = item.get("url").toString();
                        String source = item.get("source").toString();
                        Date publishTime = simpleDateFormat.parse(item.get("publish_time").toString());

                        Tencent tencent = new Tencent();
                        tencent.setTitle(title);
                        tencent.setUrl(turl);
                        tencent.setIntro(intro);
                        tencent.setSource(source);
                        tencent.setPublishTime(publishTime);

                        addNews(tencent);
                    }
                } else if (++consecutiveFailures >= maxConsecutiveFailures) {
                    throw new IOException("Feed returned HTTP " + statusCode + " for page " + page
                            + "; giving up after " + consecutiveFailures + " consecutive failures");
                }
            }
            page++;
        }
    }
}


/**
 * Inserts one news item into the {@code t_tencent} table using the shared
 * {@code jdbcTemplate}.
 *
 * @param tencent the item whose title, intro, url, source and publish time are stored
 */
public static void addNews(Tencent tencent) {
    // Bind values are passed as varargs, in the same order as the column list.
    jdbcTemplate.update(
            "insert into t_tencent (title,intro,url,source,publish_time) values (?,?,?,?,?)",
            tencent.getTitle(),
            tencent.getIntro(),
            tencent.getUrl(),
            tencent.getSource(),
            tencent.getPublishTime());
}


/**
 * Extracts the JSON payload from a JSONP response such as {@code __jp4({...})}.
 *
 * <p>The payload is the text between the first {@code '('} and the last {@code ')'}.
 * Input that is not wrapped in a callback is returned unchanged instead of raising a
 * {@code StringIndexOutOfBoundsException}. That covers text that already starts with
 * <code>{</code> or {@code [}, and text with no well-ordered pair of parentheses.
 *
 * @param data the raw response body; must not be {@code null}
 * @return the JSON text inside the callback, or {@code data} itself when there is no wrapper
 */
public static String parseJson(String data) {
    String trimmed = data.trim();
    // A body that already starts like a JSON object/array has no callback to strip;
    // parentheses inside its string values must not be mistaken for one.
    if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
        return data;
    }

    int start = data.indexOf("(");
    int end = data.lastIndexOf(")");
    // No opening parenthesis, or the closing one comes first: nothing to unwrap.
    if (start < 0 || end <= start) {
        return data;
    }
    return data.substring(start + 1, end);
}

 

结果如下:

技术分享图片

 

Java代码---实现爬取腾讯新闻

原文:https://www.cnblogs.com/suspring/p/12510826.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!