首页 > 其他 > 详细

顶会热词统计

时间:2020-06-10 09:22:18      阅读:62      评论:0      收藏:0      [点我收藏+]

爬取CVPR数据代码

技术分享图片
import re
import requests
import urllib.request
import os
import argparse
 
# Download CVPR 2018 open-access papers whose title matches a keyword.
#
# Usage: python test.py --keyword detection
# PDFs are saved to ./CVPR2018/<keyword>/<sanitized title>.pdf

BASE_URL = "http://openaccess.thecvf.com/"
INDEX_URL = BASE_URL + "CVPR2018.py"

# href value of every link whose anchor text starts with "pdf".
PDF_LINK_RE = re.compile(
    r"(?<=href=\").+?pdf(?=\">pdf)|(?<=href=\').+?pdf(?=\">pdf)"
)
# Paper-page anchors; the match runs from the href value through "</a>",
# so it still contains the title as the anchor text.
NAME_RE = re.compile(r"(?<=href=\").+?2018_paper.html\">.+?</a>")

# Characters replaced by "_" when a title is turned into a file name.
_ILLEGAL_CHARS = str.maketrans({c: "_" for c in ':"?/ '})


def extract_title(name_match):
    """Return the anchor text (paper title) from one NAME_RE match."""
    return name_match.split("<")[0].split(">")[1]


def sanitize_file_name(title):
    """Replace ':', '"', '?', '/' and spaces in *title* with underscores."""
    return title.translate(_ILLEGAL_CHARS)


def build_keyword_pattern(keyword):
    """Compile *keyword* as a case-insensitive regular expression."""
    return re.compile(r"{}".format(keyword), re.IGNORECASE)


def title_matches(file_name, pattern):
    """Return True if any '_'-separated token of *file_name* matches *pattern*."""
    return any(pattern.findall(token) for token in file_name.split("_"))


def main():
    """Fetch the CVPR 2018 index page and download every matching paper."""
    parser = argparse.ArgumentParser(description="test")
    # Keyword (treated as a regex) used to select papers by title.
    parser.add_argument("--keyword", type=str, default="detection")
    args = parser.parse_args()

    # get web context
    r = requests.get(INDEX_URL, timeout=30)
    r.raise_for_status()
    data = r.text

    # find all pdf links and the matching paper titles
    link_list = PDF_LINK_RE.findall(data)
    name_list = NAME_RE.findall(data)
    num = len(link_list)

    # your local path to download pdf files
    local_dir = "./CVPR2018/{}/".format(args.keyword)
    os.makedirs(local_dir, exist_ok=True)

    # Compiled once; the keyword does not change between papers.
    search_pattern = build_keyword_pattern(args.keyword)

    # zip() stops at the shorter list, so a mismatch between the two regex
    # results cannot raise IndexError. Counting starts at the first paper.
    for cnt, (url, name) in enumerate(zip(link_list, name_list), start=1):
        file_name = sanitize_file_name(extract_title(name))
        if not title_matches(file_name, search_pattern):
            continue

        file_path = local_dir + file_name + ".pdf"
        if os.path.exists(file_path):
            print("File 【{}.pdf】 exists,skip downloading.".format(file_name))
            continue

        # download pdf files
        print("[" + str(cnt) + "/" + str(num) + "]  Downloading -> " + file_path)
        try:
            urllib.request.urlretrieve(BASE_URL + url, file_path)
        except Exception as err:
            # Best effort: report the failure and move on to the next paper.
            # Unlike a bare except, Ctrl-C still interrupts the run.
            print("Failed to download {}: {}".format(url, err))
            # Drop a partial file so a later run retries it instead of
            # treating it as already downloaded.
            if os.path.exists(file_path):
                os.remove(file_path)

    print("all download finished")


if __name__ == "__main__":
    main()
test.py

展示统计结果jsp

技术分享图片
<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%--
  Hot-word statistics page.
  Renders an ECharts word cloud from the JSON returned by POST /PaperData/getData
  (read here as a list of objects with "name" and "value" fields) and, below it,
  a table of paper links taken from the "dataList" attribute.
  Clicking a word navigates to clickFunction?name=<word>.
--%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>顶会热词统计</title>
<script src="https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js"></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<style>
html, body, #main {
    width: 100%;
    height: 100%;
    margin: 0;
}
</style>
</head>
<body>
    <div id="main"></div>
    <div>
        <table class="table table-hover">
            <thead>
                <tr>
                    <td style="font-size: 20px;">论文链接</td>
                </tr>
            </thead>
            <tbody>
                <c:forEach items="${dataList}" var="data" varStatus="vs">
                    <tr>
                        <%-- c:out HTML-escapes the values so titles/links cannot inject markup. --%>
                        <td><a href="<c:out value='${data.lianjie}'/>"><c:out value="${data.title}"/></a></td>
                    </tr>
                </c:forEach>
            </tbody>
        </table>
    </div>
    <script>
        var chart = echarts.init(document.getElementById("main"));
        var postURL = "/PaperData/getData";
        var mydata = new Array();
        // The request is made synchronously so that mydata is filled
        // before the chart option below is built.
        $.ajaxSettings.async = false;
        $.post(postURL, {}, function(rs) {
            var dataList = JSON.parse(rs);
            for (var i = 0; i < dataList.length; i++) {
                var d = {};
                d["name"] = dataList[i].name;
                d["value"] = dataList[i].value;
                mydata.push(d);
            }
        });
        $.ajaxSettings.async = true;
        var option = {
            tooltip : {},
            series : [ {
                type : "wordCloud",
                gridSize : 2,
                sizeRange : [ 20, 50 ],
                rotationRange : [ -90, 90 ],
                shape : "pentagon",
                width : 800,
                height : 600,
                drawOutOfBound : false,
                textStyle : {
                    normal : {
                        // Random colour per word; channels capped at 160.
                        color : function() {
                            return "rgb("
                                    + [ Math.round(Math.random() * 160),
                                            Math.round(Math.random() * 160),
                                            Math.round(Math.random() * 160) ]
                                            .join(",") + ")";
                        }
                    },
                    emphasis : {
                        shadowBlur : 10,
                        shadowColor : "#333"
                    }
                },
                data : mydata
            } ]
        };
        chart.setOption(option);
        chart.on("click", function(params) {
            // Encode the word so spaces, '&', '#', etc. survive in the query string.
            var url = "clickFunction?name=" + encodeURIComponent(params.name);
            window.location.href = url;
        });
    </script>
</body>
</html>
count.jsp

技术分享图片

技术分享图片

 

顶会热词统计

原文:https://www.cnblogs.com/chenaiiu/p/13082429.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!