PSP2.1 | Personal Software Process Stages | 预估耗时(分钟) | 实际耗时(分钟) |
---|---|---|---|
Planning | 计划 | 10 | 10 |
? Estimate | ? 估计这个任务需要多少时间 | 10 | 10 |
Development | ? 开发 | 1210 | 1670 |
? Analysis | ? 需求分析 (包括学习新技术) | 400 | 500 |
? Design Spec | ? 生成设计文档 | 10 | 10 |
? Design Review | ? 设计复审 | 10 | 10 |
? Coding Standard | ? 代码规范 (为目前的开发制定合适的规范) | 10 | 10 |
? Design | ? 具体设计 | 20 | 20 |
? Coding | ? 具体编码 | 500 | 600 |
? Code Review | ? 代码复审 | 60 | 120 |
? Test | ? 测试(自我测试,修改代码,提交修改) | 200 | 400 |
Reporting | 报告 | 30 | 40 |
? Test Repor | ? 测试报告 | 10 | 10 |
? Size Measurement | ? 计算工作量 | 10 | 10 |
? Postmortem & Process Improvement Plan | ? 事后总结, 并提出过程改进计划 | 10 | 20 |
合计 | 1250 | 1710 |
<dependency>
<!-- jsoup HTML parser library @ http://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
接下来是具体实现部分,思路是先爬取每个文章的链接,再逐个爬取每个链接,获取标题和摘要。
package cvpr;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**爬虫实现
*
* @author xyq
* @since 2018 10 10
* @version 1.0
*/
public class clawler {
/**爬取cvpr网站
*
* @return Map<String, String>
*
*/
public Map<String, String> getPaperInfo() {
Map<String, String> paperInfo = new HashMap<String, String>();
ArrayList<String> htmlLinks = new ArrayList<String>();
String head = "http://openaccess.thecvf.com/";//html链接前半部分
try
{
Document doc = null ;
doc = Jsoup.connect("http://openaccess.thecvf.com/CVPR2018.py#").get();
//Document document = Jsoup.parse(new File("C:/Users/zkpkhua/Desktop/yiibai-index.html"), "utf-8");
Elements links = doc.select("dt a[href]");
for (Element link : links)
{
//System.out.println("link : " + link.attr("href"));
htmlLinks.add(head+link.attr("href"));//得到html链接
//System.out.println(head+link.attr("href"));
}
}
catch (IOException e)
{
e.printStackTrace();
}
try
{
for (String htmlLink : htmlLinks) {
Document doc = null ;
doc = Jsoup.connect(htmlLink).get();
Elements titleInfo = doc.select("div#papertitle");
Elements abstractInfo = doc.select("div#abstract");
//System.out.println(titleInfo.text());
paperInfo.put(titleInfo.text(), abstractInfo.text());
}
}
catch (IOException e)
{
e.printStackTrace();
}
return paperInfo;
}
public void writeResult() throws Exception {
Map<String, String> paperInfo = new HashMap<String, String>();
clawler clawler = new clawler();
paperInfo = clawler.getPaperInfo();
String path = System.getProperty("user.dir")+"\\src\\main\\java\\cvpr\\result.txt";
File file = new File(path);
StringBuilder result = new StringBuilder("");
int count = 0;
FileWriter filewriter = new FileWriter(file.getAbsoluteFile());
//System.out.println("absolutely path:"+file.getAbsolutePath());
BufferedWriter bufferedWriter = new BufferedWriter(filewriter);
for (Entry<String, String> entry : paperInfo.entrySet()) {
result.append(count);
count++;
result.append("\r\n");
result.append("Title:"+entry.getKey());
result.append("\r\n");
result.append("Abstract:"+entry.getValue());
result.append("\r\n");
result.append("\r\n");
result.append("\r\n");
//System.out.println("title: "+entry.getKey());
//System.out.println("abstract: "+entry.getValue());
}
bufferedWriter.write(result.toString());
bufferedWriter.close();
}
}
UML类图
流程图
关键实现部分流程图
这次的关键在于新增的词组统计功能,一下是实现的流程图。
/**排序,按权重降序
*
*
*/
private static void sort(ArrayList<HashMap.Entry<String, Integer>> wordList) {
Collections.sort(wordList, new Comparator<HashMap.Entry<String, Integer>>() {
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
if (o1.getValue() < o2.getValue()) {
return 1;
} else {
if (o1.getValue().equals(o2.getValue())) {
if (o1.getKey().compareTo(o2.getKey()) > 0) {
return 1;
} else {
return -1;
}
} else {
return -1;
}
}
}
});
}
获取命令行参数
/*
* 这里先取默认值,因为不是所有参数都会被用户提供
*/
String in = " ";
String o = " ";
int m = 1;
int n = 10;
int w = 1;
long charNum = 0;
long lineNum = 0;
long wordNum = 0;
/*
* 设置一个offset变量,用来定位相关信息
*/
int optSetting = 0;
for (; optSetting < args.length; optSetting++) {
if ("-i".equals(args[optSetting])) {
in = args[++optSetting];
} else if ("-o".equals(args[optSetting])) {
o = args[++optSetting];
} else if ("-m".equals(args[optSetting])) {
m = Integer.parseInt(args[++optSetting]);
} else if ("-n".equals(args[optSetting])) {
n = Integer.parseInt(args[++optSetting]);
} else if ("-w".equals(args[optSetting])) {
w = Integer.parseInt(args[++optSetting]);
}
}
for(;i < str.length();i++) {
//大写字母转为小写字母
temp = str.charAt(i);
if ((temp >= 65) && (temp <= 90)) {
temp += 32;
}
if(state == 0) {
if ((temp >= 97) && (temp <= 122)) {
word.append(temp);
state = 1;
}
}else if (state == 1) {
if ((temp >= 97) && (temp <= 122)) {
word.append(temp);
state = 2;
} else {
word.setLength(0);
state = 0;
}
}else if (state == 2) {
if ((temp >= 97) && (temp <= 122)) {
word.append(temp);
state = 3;
} else {
word.setLength(0);
state = 0;
}
}else if (state == 3) {
if ((temp >= 97) && (temp <= 122)) {
word.append(temp);
state = 4;
} else {
word.setLength(0);
state = 0;
}
}else if (state == 4) {
if (((temp >= 97) && (temp <= 122)) || ((temp >= ‘0‘) && (temp <= ‘9‘))) {
word.append(temp);
} else {
wordlist.add(word.toString());
word.setLength(0);
state = 0;
}
}
}
if (state == 4) {
wordlist.add(word.toString());
}
@Test
public void testPhraseFrequency2_1() {
phraseFrequencyCounter phraseFrequencyCounter = new phraseFrequencyCounter();
int topPhrasenum = 0;
ArrayList<HashMap.Entry<String, Integer>> wordList = new ArrayList<HashMap.Entry<String, Integer>>();
HashMap<String, Integer> wordlistMap = new HashMap<String, Integer>();
wordlistMap = phraseFrequencyCounter.countPhraseFrequency("D:\\java_project\\031602435&031602414\\src\\test\\java\\wordCount2_031602435\\"+"result.txt",1,3);
wordList = phraseFrequencyCounter.topFrequentPhrases(wordlistMap);
topPhrasenum = wordList.get(0).getValue();
assertEquals(97, topPhrasenum);
}
@Test
public void testWordNum1() {
wordNumberCounter wordNumberCounter = new wordNumberCounter();
long wordnum =wordNumberCounter.countWord("D:\\java_project\\031602435&031602414\\src\\test\\java\\wordCount2_031602435\\"+"text1.txt");
System.out.println(wordnum);
assertEquals(9, wordnum);
}
@Test
public void testPhraseFrequency1_1() {
phraseFrequencyCounter phraseFrequencyCounter = new phraseFrequencyCounter();
int topPhrasenum = 0;
ArrayList<HashMap.Entry<String, Integer>> wordList = new ArrayList<HashMap.Entry<String, Integer>>();
HashMap<String, Integer> wordlistMap = new HashMap<String, Integer>();
wordlistMap = phraseFrequencyCounter.countPhraseFrequency("D:\\java_project\\031602435&031602414\\src\\test\\java\\wordCount2_031602435\\"+"text1.txt",1,3);
wordList = phraseFrequencyCounter.topFrequentPhrases(wordlistMap);
topPhrasenum = wordList.get(0).getValue();
assertEquals(11, topPhrasenum);
}
这次的用时虽然有20多小时,但是完成总时间是在两天半(不包括学习技术)。。所以只有一条记录。
认真负责
java:2->2.4
自主学习能力:2->2.6
原文:https://www.cnblogs.com/daydreams/p/9781184.html