1、完成论文的题目、摘要、关键词、原文链接四项内容爬取;
2、存储到本地数据库中;
3、按照题目、关键词分类统计得到最热的十个领域方向;
4、热词出现次数越多,在热词云中显示得就越大;还要将热词与文章关联起来,点击热词云中的热词可以找到与之对应的文章题目;
1、爬取数据:
/** * 业务 */ public class LW { public static void main(String[] args) throws SQLException { LWDao dao = new LWDao(); //dao.deleteAll(); String s = HttpUtil.setUrl("http://openaccess.thecvf.com/CVPR2019.py"); List<Paper> lw = JsoupNewsUtil.lw(s); for (int i = 0; i < lw.size(); i++) { dao.add(lw.get(i)); } } } /** * 工具 */ public class HttpUtil { /** * 返回json * @param setUrl * @return */ public static String setUrl(String setUrl){ try { URL url = new URL(setUrl); HttpURLConnection conn = (HttpURLConnection)url.openConnection(); conn.setConnectTimeout(5000); conn.setRequestMethod("GET"); int responseCode = conn.getResponseCode(); if (responseCode == HttpURLConnection.HTTP_OK){ InputStream inputStream = conn.getInputStream(); InputStreamReader inputStreamReader = new InputStreamReader(inputStream); BufferedReader reader = new BufferedReader(inputStreamReader); StringBuffer stringBuffer = new StringBuffer(); String string = reader.readLine(); while (string != null) { stringBuffer.append(string); string = reader.readLine(); } return stringBuffer.toString(); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return ""; } } public class JsoupNewsUtil { public static List<Paper> lw(String data){ List<Paper> LWList = new ArrayList<Paper>(); Document document = Jsoup.parse(data); //获取论文的内容 Element content = document.getElementById("content"); Elements dl = content.getElementsByTag("dl"); Elements dt = dl.first().getElementsByTag("dt"); for (Element element : dt) { Paper paper = new Paper(); Elements a = element.getElementsByTag("a"); String tm = a.first().text(); String ywlj = a.first().attr("href"); String jy = jy(ywlj); paper.setTm(tm); paper.setYwlj("http://openaccess.thecvf.com/"+ywlj); paper.setZy(jy); LWList.add(paper); if (LWList.size()>99){ break; } } return LWList; } public static String jy(String url){ String s = HttpUtil.setUrl("http://openaccess.thecvf.com/"+url); Document document = Jsoup.parse(s); Element 
anAbstract = document.getElementById("abstract"); String str = ""; if (anAbstract != null) { str = anAbstract.text().trim(); } return str; } }
2、数据存储
public class DBUtils { private static DataSource dataSource = new ComboPooledDataSource(); private static ThreadLocal<Connection> tl = new ThreadLocal<Connection>(); // 直接可以获取一个连接池 public static DataSource getDataSource() { return dataSource; } public static Connection getConnection() throws SQLException{ return dataSource.getConnection(); } // 获取连接对象 public static Connection getCurrentConnection() throws SQLException { Connection con = tl.get(); if (con == null) { con = dataSource.getConnection(); tl.set(con); } return con; } // 开启事务 public static void startTransaction() throws SQLException { Connection con = getCurrentConnection(); if (con != null) { con.setAutoCommit(false); } } // 事务回滚 public static void rollback() throws SQLException { Connection con = getCurrentConnection(); if (con != null) { con.rollback(); } } // 提交并且 关闭资源及从ThreadLocall中释放 public static void commitAndRelease() throws SQLException { Connection con = getCurrentConnection(); if (con != null) { con.commit(); // 事务提交 con.close();// 关闭资源 tl.remove();// 从线程绑定中移除 } } // 关闭资源方法 public static void closeConnection() throws SQLException { Connection con = getCurrentConnection(); if (con != null) { con.close(); } } public static void closeStatement(Statement st) throws SQLException { if (st != null) { st.close(); } } public static void closeResultSet(ResultSet rs) throws SQLException { if (rs != null) { rs.close(); } } } public class LWDao { /** * * @param * @return * @throws SQLException */ public boolean add(Paper paper) throws SQLException { QueryRunner qr = new QueryRunner(DBUtils.getDataSource()); String sql = "insert into paper (tm,zy,gjc,ywlj) " + "values(?,?,?,?)"; int update = qr.update(sql,paper.getTm(),paper.getZy(),paper.getGjc(),paper.getYwlj()); if (update > 0) { return true; } else { return false; } } public List<Paper> search() throws SQLException { QueryRunner qr = new QueryRunner(DBUtils.getDataSource()); String sql = "select * from paper"; List<Paper> query = qr.query(sql, new 
BeanListHandler<Paper>(Paper.class)); return query; } public List<LunWen> search_() throws SQLException { QueryRunner qr = new QueryRunner(DBUtils.getDataSource()); String sql = "select * from cvpr"; List<LunWen> query = qr.query(sql, new BeanListHandler<LunWen>(LunWen.class)); return query; } public boolean gjc(String gjc,int id) throws SQLException { QueryRunner qr = new QueryRunner(DBUtils.getDataSource()); String sql = "update paper set fjc = ? where id=? "; int n = qr.update(sql, gjc,id); if (n > 0) { return true; } else { return false; } } /** * * @return * @throws SQLException */ public boolean deleteAll() throws SQLException { QueryRunner qr =new QueryRunner(DBUtils.getDataSource()); String sql="delete from paper "; int n = qr.update(sql); if (n > 0) { return true; } else { return false; } } }
3、使用 ECharts 的 wordCloud 实现热词云。
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%--
  Shows the keyword cloud (loaded as JSON from PaperServlet_) above a table of
  paper links taken from the "list" request attribute. Clicking a word opens
  ClickServlet with that word as the "geunjian" parameter.
--%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
<link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
<script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
<script type="text/javascript" src="js/echarts.min.js"></script>
<script type="text/javascript" src="js/china.js"></script>
<script src="js/bootstrap.min.js" type="text/javascript"></script>
<%-- NOTE(review): a second ECharts build is loaded here after js/echarts.min.js;
     presumably only one of the two is needed - confirm before removing either. --%>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src='js/echarts-wordcloud.js'></script>
</head>
<body>
    <div id="main" style="width: 100%;height: 400px"></div>
    <div>
        <table class="table" style="width: 100%;align-content: center;">
            <tr>
                <th align="center">论文连接</th>
            </tr>
            <c:forEach var="item" items="${list}">
                <tr>
                    <%-- c:out escapes the values so stored text cannot inject markup. --%>
                    <td><a href="<c:out value='${item.lianjie}'/>"><c:out value="${item.title}"/></a></td>
                </tr>
            </c:forEach>
        </table>
    </div>
    <script>
        var chart = echarts.init(document.getElementById('main'));

        // Builds the word-cloud option for the given [{name, value}, ...] data and draws it.
        function renderCloud(words) {
            var option = {
                tooltip: {},
                series: [ {
                    type: 'wordCloud',
                    gridSize: 2,
                    sizeRange: [20, 50],
                    rotationRange: [-90, 90],
                    shape: 'pentagon',
                    width: 600,
                    height: 300,
                    drawOutOfBound: true,
                    textStyle: {
                        normal: {
                            // Random dark colour per word.
                            color: function () {
                                return 'rgb(' + [
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160)
                                ].join(',') + ')';
                            }
                        },
                        emphasis: {
                            shadowBlur: 10,
                            shadowColor: '#333'
                        }
                    },
                    data: words
                } ]
            };
            chart.setOption(option);
        }

        // Load the keyword counts and draw only once they have arrived, so a
        // failed request never reaches the rendering code with undefined data.
        $.ajax({
            url : "PaperServlet_",
            type : "POST",
            dataType : "json",
            success : function(data) {
                var mydata = [];
                for (var i = 0; data && i < data.length; i++) {
                    if (!data[i] || !data[i].name) {
                        continue; // skip entries without a word
                    }
                    mydata.push({ name : data[i].name, value : data[i].value });
                }
                renderCloud(mydata);
            },
            error : function() {
                alert("请求失败");
            }
        });

        chart.on('click', function (params) {
            // Encode the word so spaces, '&', '#' or non-ASCII text survive the URL.
            var url = "ClickServlet?geunjian=" + encodeURIComponent(params.name);
            window.location.href = url;
        });

        // Call resize on the chart itself; assigning chart.resize directly would
        // invoke it without its chart receiver.
        window.onresize = function () {
            chart.resize();
        };
    </script>
</body>
</html>
4、热词集合的计算和传递
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.google.gson.Gson;
import com.me.dao.LWDao;
import com.me.domain.LunWen;
import com.me.domain.Tu;

/**
 * Returns the most frequent keywords of all stored papers as a JSON array of
 * {@code {name, value}} objects, ordered by descending frequency. The word
 * cloud on the page consumes this output.
 */
@WebServlet("/PaperServlet_")
public class PaperServlet_ extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Maximum number of keywords sent to the page. */
    private static final int MAX_WORDS = 100;

    public PaperServlet_() {
        super();
    }

    /**
     * Counts the space-separated keywords of every paper and writes the
     * {@value #MAX_WORDS} most frequent ones as JSON. When the papers cannot
     * be loaded the failure is logged and an empty array is written.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        response.setContentType("application/json");
        response.setCharacterEncoding("UTF-8");

        List<LunWen> papers;
        try {
            papers = new LWDao().search_();
        } catch (SQLException e) {
            log("Failed to load papers for the keyword cloud", e);
            papers = new ArrayList<LunWen>();
        }

        List<Map.Entry<String, Integer>> ranked =
                new ArrayList<Map.Entry<String, Integer>>(countKeywords(papers).entrySet());
        // Most frequent first; the sort is stable, so ties keep first-seen order.
        Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                return b.getValue().compareTo(a.getValue());
            }
        });

        // Emit only as many entries as there are distinct words, never padding
        // the result with empty entries.
        int limit = Math.min(MAX_WORDS, ranked.size());
        List<Tu> cloud = new ArrayList<Tu>(limit);
        for (int i = 0; i < limit; i++) {
            Map.Entry<String, Integer> entry = ranked.get(i);
            Tu tu = new Tu();
            tu.name = entry.getKey();
            tu.value = entry.getValue().intValue();
            cloud.add(tu);
        }

        response.getWriter().write(new Gson().toJson(cloud));
    }

    /**
     * Counts how often each keyword occurs across the given papers.
     * Papers without keywords and empty tokens are ignored.
     */
    private static Map<String, Integer> countKeywords(List<LunWen> papers) {
        // Insertion-ordered so that equally frequent words keep a stable order.
        Map<String, Integer> counts = new LinkedHashMap<String, Integer>();
        for (LunWen paper : papers) {
            String keywords = paper.getGuanjian();
            if (keywords == null) {
                continue;
            }
            for (String word : keywords.split(" ")) {
                if (word.isEmpty()) {
                    continue; // produced by consecutive spaces
                }
                Integer seen = counts.get(word);
                counts.put(word, seen == null ? 1 : seen + 1);
            }
        }
        return counts;
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
5、论文链接列表数据准备
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;
import com.me.domain.Tu;

/**
 * Loads all stored papers, turns their stored links into absolute URLs and
 * forwards to {@code lw.jsp} with the papers in the {@code list} attribute.
 */
@WebServlet("/LunServlet")
public class LunServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site root prepended to the stored links. */
    private static final String BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored link before the site
     * root is prepended. NOTE(review): presumably a relative-path prefix stored
     * with the link - confirm against the data.
     */
    private static final int LINK_PREFIX_LENGTH = 6;

    public LunServlet() {
        super();
    }

    /**
     * Prepares the paper list and forwards to the JSP. When the papers cannot
     * be loaded the failure is logged and an empty list is shown.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        response.setHeader("content-type", "text/html;charset=UTF-8");
        response.setCharacterEncoding("UTF-8");

        List<LunWen> list = new ArrayList<LunWen>();
        try {
            list = new LWDao().search_();
        } catch (SQLException e) {
            log("Failed to load papers", e);
        }

        for (LunWen paper : list) {
            String link = paper.getLianjie();
            // Links shorter than the prefix cannot be stripped; they are left
            // unchanged instead of failing the whole request.
            if (link != null && link.length() >= LINK_PREFIX_LENGTH) {
                paper.setLianjie(BASE_URL + link.substring(LINK_PREFIX_LENGTH));
            }
        }

        request.setAttribute("list", list);
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
6、点击热词后显示包含该热词的论文列表
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;

/**
 * Handles a click on a word of the keyword cloud: looks up the papers for the
 * word passed in the {@code geunjian} request parameter and forwards to
 * {@code lw.jsp} with them in the {@code list} attribute.
 */
@WebServlet("/ClickServlet")
public class ClickServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site root prepended to the stored links. */
    private static final String BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored link before the site
     * root is prepended. NOTE(review): presumably a relative-path prefix stored
     * with the link - confirm against the data.
     */
    private static final int LINK_PREFIX_LENGTH = 6;

    LWDao dao = new LWDao();

    public ClickServlet() {
        super();
    }

    /**
     * Looks up the papers for the requested keyword, makes their links
     * absolute and forwards to the JSP. When the lookup fails the failure is
     * logged and an empty list is shown.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        String geunjian = request.getParameter("geunjian");

        List<LunWen> guan = new ArrayList<LunWen>();
        try {
            guan = dao.login(geunjian);
        } catch (SQLException e) {
            log("Failed to load papers for the selected keyword", e);
        }

        for (LunWen paper : guan) {
            String link = paper.getLianjie();
            // Links shorter than the prefix cannot be stripped; they are left
            // unchanged instead of failing the whole request.
            if (link != null && link.length() >= LINK_PREFIX_LENGTH) {
                paper.setLianjie(BASE_URL + link.substring(LINK_PREFIX_LENGTH));
            }
        }

        request.setAttribute("list", guan);
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
原文:https://www.cnblogs.com/20183544-wangzhengshuai/p/12702137.html