哈哈,你想知道jdk源码中的英语单词分布情况么?下面我们介绍一种jdk1.7的新api来递归遍历文件夹。这个新的api位于java.nio.file.*下面,里面添加了更加符合语义的Path,也就是路径这个对象。然后为了更好地遍历目录,提供了一种访问者设计模式来递归访问目录。主要的框架代码如下:
<span style="font-size:18px;">public static class PrintFiles
extends SimpleFileVisitor<Path> {
/**
 * Prints one line per visited file: its kind (symbolic link, regular file
 * or other), its path and its size as reported by the attributes.
 *
 * @param file the file being visited
 * @param attr the basic attributes of {@code file}
 * @return {@code CONTINUE}, so the walk always proceeds
 */
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attr) {
    final String template;
    if (attr.isSymbolicLink()) {
        template = "Symbolic link: %s ";
    } else if (attr.isRegularFile()) {
        template = "Regular file: %s ";
    } else {
        template = "Other: %s ";
    }
    System.out.format(template, file);
    System.out.println("(" + attr.size() + "bytes)");
    return CONTINUE;
}
/**
 * Prints the path of a directory after its entries have been visited.
 *
 * @param dir the directory that was just completed
 * @param exc the error raised while iterating the directory, or
 *            {@code null}; it is not inspected here
 * @return the CONTINUE result, so the walk always proceeds
 */
@Override
public FileVisitResult postVisitDirectory(Path dir, IOException exc) {
    System.out.printf("Directory: %s%n", dir);
    return FileVisitResult.CONTINUE;
}
/**
 * Reports a file that could not be visited and keeps walking.
 *
 * <p>Without this override an access error would surface as an
 * {@link IOException} from the walk.
 *
 * @param file the file that could not be visited
 * @param exc  the error that prevented the visit
 * @return the CONTINUE result, so the walk always proceeds
 */
@Override
public FileVisitResult visitFileFailed(Path file, IOException exc) {
    System.err.println(String.valueOf(exc));
    return FileVisitResult.CONTINUE;
}
}</span>这个接口主要有三个方法,访问文件、访问目录、访问失败。很明显,我们只需要继承该类,然后实现相关业务逻辑即可。当需要遍历某个目录时,直接调用Files.walkFileTree(startingDir, SimpleFileVisitor)方法即可。其中上面的例子什么也不做,只是简单地打印出一些相关信息。
进入正题,怎么统计JDK源码中的英语单词分布呢?下面先分解下我们的任务。在这里面,需要一个类用于解析一个.java文件,并且将里面的单词都抽取出来,好吧,该类不难,无非就是读取类文件内容,然后使用相关的方式,将这些杂乱的内容变成单词即可。为了更加容易管理,这里将单词也抽象为一个类,具体代码如下:
package net.itaem.luohong.honghong;
/**
 * An immutable wrapper around one word extracted from source text.
 *
 * <p>Equality, hashing and ordering are all based on the wrapped string.
 * Ordering is the natural {@link String} order, with a {@code null} word
 * sorting before every non-null word and equal to another {@code null} word,
 * so instances can be used safely as keys of a sorted map.
 */
public class Word implements Comparable<Word> {
    private final String word;

    /**
     * @param word the text of the word; may be {@code null}
     */
    public Word(String word) {
        this.word = word;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((word == null) ? 0 : word.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Word other = (Word) obj;
        return (word == null) ? other.word == null : word.equals(other.word);
    }

    /**
     * @return the wrapped text itself (so {@code null} when the word is null)
     */
    @Override
    public String toString() {
        return word;
    }

    /**
     * @return the wrapped text, possibly {@code null}
     */
    public String getWord() {
        return word;
    }

    /**
     * Orders words by their text.
     *
     * @param o the word to compare with; must not be {@code null}
     * @return a negative, zero or positive value as this word sorts before,
     *         equal to, or after {@code o}
     */
    @Override
    public int compareTo(Word o) {
        String mine = this.getWord();
        String theirs = o.getWord();
        if (mine == null) {
            // Two null words must compare as equal; returning -1 here would
            // break the Comparable contract and make a TreeMap store the same
            // key over and over.
            return (theirs == null) ? 0 : -1;
        }
        if (theirs == null) {
            return 1;
        }
        return mine.compareTo(theirs);
    }
}
package net.itaem.luohong.honghong;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
/**
 * Extracts word-like tokens from the text of a single file.
 *
 * <p>The file is read as text and split on whitespace. Each token is then
 * stripped of surrounding punctuation, operators and Javadoc/HTML markup;
 * tokens that still contain a delimiter such as {@code .} or {@code (} are
 * split further. Operator-only tokens and numbers are discarded.
 */
public class WordReader {

    /**
     * Delimiters that cause a token to be split further, in the order in
     * which they are tried. Shared by {@link #needSplitToWord} and
     * {@link #splitToWords0} so the two can never disagree.
     */
    private static final String[] SPLIT_DELIMITERS = {
        ".", "/", "-", "#", "]", "[", ",", "(", ">", "<", "="
    };

    /** Tokens that are rejected outright when a candidate equals one of them. */
    private static final String[] NOISE_TOKENS = {
        "", "*", "{", "}", "+", "-", "/", "=", "!=", ">", "<", ">=", "<=",
        "||", "}}", "/*", "};", "+=", ".", ":", "/**", "//", "?"
    };

    // Compiled once: these checks run for every token of every file.
    private static final java.util.regex.Pattern DIGITS =
            java.util.regex.Pattern.compile("\\d+");
    private static final java.util.regex.Pattern NUMBER =
            java.util.regex.Pattern.compile("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");

    /**
     * Reads the file at {@code path} and returns the words found in it.
     *
     * @param path path of the file to read
     * @return the extracted words in order of appearance; empty when the
     *         file cannot be read (the error is printed, not thrown)
     */
    public static List<Word> wordInFile(String path){
        return stringToWords(read(path));
    }

    /**
     * Reads the whole file into one string using the platform default charset.
     *
     * @param path path of the file to read
     * @return the file content with each line terminated by a newline, or
     *         whatever was read before an I/O error occurred
     */
    private static String read(String path){
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path))))) {
            String line;
            while((line = reader.readLine()) != null){
                // readLine() drops the line terminator; put a separator back so
                // the last token of one line cannot fuse with the first token
                // of the next.
                sb.append(line).append('\n');
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Turns raw text into a list of words.
     *
     * @param source the text to tokenize
     * @return the words extracted from {@code source}
     */
    private static List<Word> stringToWords(String source){
        StringTokenizer strToken = new StringTokenizer(source);
        List<Word> wordList = new ArrayList<Word>();
        while(strToken.hasMoreTokens()){
            String wordStr = strToken.nextToken();
            if(isInvalidString(wordStr)){
                continue;
            }
            // Two passes, because each rule is applied once per pass and an
            // earlier rule may only become applicable after a later one fired.
            wordStr = removeInvalidStr(removeInvalidStr(wordStr));
            if(wordStr == null || "".equals(wordStr)){
                continue;
            }
            if(needSplitToWord(wordStr)){
                recurSplitToWord(wordStr, wordList);
            }else{
                addCleanWord(wordStr, wordList);
            }
        }
        return wordList;
    }

    /**
     * Cleans {@code candidate} one final time and appends it to
     * {@code wordList} only if something usable is left.
     *
     * <p>The cleaned result is re-validated: cleaning can reduce a token to
     * nothing (for example the {@code (;;)} of {@code for (;;)}), and such a
     * token must not be recorded as a word.
     */
    private static void addCleanWord(String candidate, List<Word> wordList){
        if(isInvalidString(candidate)){
            return;
        }
        String cleaned = removeInvalidStr(candidate);
        if(!isInvalidString(cleaned)){
            wordList.add(new Word(cleaned));
        }
    }

    /**
     * Tells whether a token contains one of the split delimiters.
     *
     * @param wordStr the token to inspect
     * @return {@code true} when the token must be split into several words
     */
    private static boolean needSplitToWord(String wordStr){
        if(wordStr == null || "".equals(wordStr)){
            return false;
        }
        for(String delimiter: SPLIT_DELIMITERS){
            if(wordStr.contains(delimiter)){
                return true;
            }
        }
        return false;
    }

    /**
     * Splits a token recursively until no piece contains a delimiter.
     * Recursion is needed because one token may contain several different
     * delimiters and each step splits on a single one.
     *
     * @param wordStr  the token to split
     * @param wordList receives the resulting words
     */
    private static void recurSplitToWord(String wordStr, List<Word> wordList){
        if(wordStr == null || !needSplitToWord(wordStr)){
            return;
        }
        for(String word: splitToWords0(wordStr)){
            if(needSplitToWord(word)){
                recurSplitToWord(word, wordList);
            }else{
                addCleanWord(word, wordList);
            }
        }
    }

    /**
     * Splits a token on the first delimiter of {@link #SPLIT_DELIMITERS}
     * that it contains.
     *
     * @param wordStr the token to split
     * @return the pieces; the token itself when it contains no delimiter
     */
    private static String[] splitToWords0(String wordStr){
        for(String delimiter: SPLIT_DELIMITERS){
            if(wordStr.contains(delimiter)){
                // quote() makes the delimiter a literal, so regex
                // metacharacters such as . ( [ need no manual escaping.
                return wordStr.split(java.util.regex.Pattern.quote(delimiter));
            }
        }
        return new String[]{wordStr};
    }

    /**
     * Tells whether a token is noise rather than a word: null or empty, a
     * bare operator or comment marker, or an unsigned integer.
     *
     * @param str the token to test
     * @return {@code true} when the token must be discarded
     */
    private static boolean isInvalidString(String str){
        if(str == null){
            return true;
        }
        for(String noise: NOISE_TOKENS){
            if(str.equals(noise)){
                return true;
            }
        }
        if(DIGITS.matcher(str).matches() || str.endsWith("==")){
            return true;
        }
        // Tokens mixing "==", "&" and ";". The original author's example for
        // this rule is fragments like "o==null ? get".
        return str.contains("==") && str.contains("&") && str.contains(";");
    }

    /**
     * Tells whether a string is an optionally signed integer or decimal.
     * The empty string and a lone sign also match, which
     * {@link #removeInvalidStr} relies on to reject tokens that were
     * stripped down to nothing.
     */
    private static boolean isNumber(String str){
        return NUMBER.matcher(str).matches();
    }

    /** Returns {@code s} without {@code prefix} if it starts with it, else {@code s}. */
    private static String stripPrefix(String s, String prefix){
        return s.startsWith(prefix) ? s.substring(prefix.length()) : s;
    }

    /** Returns {@code s} without {@code suffix} if it ends with it, else {@code s}. */
    private static String stripSuffix(String s, String suffix){
        return s.endsWith(suffix) ? s.substring(0, s.length() - suffix.length()) : s;
    }

    /**
     * Strips punctuation, operators and markup from around a token. Every
     * rule is applied at most once, in a fixed order.
     *
     * @param wordStr the token to clean
     * @return the cleaned token, or {@code null} when the token is noise,
     *         becomes empty, or turns out to be a number
     */
    private static String removeInvalidStr(String wordStr){
        if(isInvalidString(wordStr)){
            return null;
        }
        // Statement and argument punctuation.
        wordStr = stripSuffix(wordStr, ";");
        wordStr = stripSuffix(wordStr, ",");
        wordStr = stripSuffix(wordStr, "()");
        wordStr = stripPrefix(wordStr, "(");
        wordStr = stripSuffix(wordStr, ")");
        wordStr = stripSuffix(wordStr, ":");
        // Surrounding double quotes and a trailing full stop.
        wordStr = stripPrefix(wordStr, "\"");
        wordStr = stripSuffix(wordStr, "\"");
        wordStr = stripSuffix(wordStr, ".");
        // A leading end-of-comment marker.
        wordStr = stripPrefix(wordStr, "*/");
        // The tail of something like "java.util;/**".
        wordStr = stripSuffix(wordStr, ";/**");
        // Javadoc inline-tag and annotation markers.
        wordStr = stripPrefix(wordStr, "{@");
        wordStr = stripPrefix(wordStr, "@");

        // Tokens fully wrapped in angle brackets:
        //   <T> -> T,  <pre> -> pre,  <tt>hello</tt> -> hello,  </pre> -> pre
        if(wordStr.startsWith("<") && wordStr.endsWith(">")){
            if(!wordStr.contains("</")){
                wordStr = wordStr.substring(wordStr.indexOf("<") + 1, wordStr.lastIndexOf(">"));
            }else if(!wordStr.startsWith("</")){
                int contentStart = wordStr.indexOf(">") + 1;
                int contentEnd = wordStr.lastIndexOf("</");
                // A malformed token such as "<a</b>" has its first '>' after
                // the closing tag; leave it for the later rules instead of
                // failing in substring().
                if(contentStart <= contentEnd){
                    wordStr = wordStr.substring(contentStart, contentEnd);
                }
            }else{
                wordStr = wordStr.substring(2, wordStr.lastIndexOf(">"));
            }
        }
        // "<li>time" -> "time"
        if(wordStr.contains("<") && wordStr.contains(">") && !wordStr.contains("/") && wordStr.startsWith("<")){
            wordStr = wordStr.substring(wordStr.lastIndexOf(">") + 1);
        }
        // "time<li>" -> "time"
        if(wordStr.contains("<") && wordStr.contains(">") && !wordStr.contains("/") && wordStr.endsWith(">")){
            wordStr = wordStr.substring(0, wordStr.lastIndexOf("<"));
        }
        // "time</li>" -> "time"
        if(wordStr.contains("</") && wordStr.contains(">")){
            wordStr = wordStr.substring(0, wordStr.lastIndexOf("</"));
        }
        // Leftover angle brackets and braces.
        wordStr = stripPrefix(wordStr, "<");
        wordStr = stripSuffix(wordStr, ">");
        wordStr = stripPrefix(wordStr, "{");
        wordStr = stripSuffix(wordStr, "}");
        // Comparison and assignment operators glued to the token.
        wordStr = stripPrefix(wordStr, "==");
        wordStr = stripSuffix(wordStr, "=");
        wordStr = stripSuffix(wordStr, "{");
        // Array brackets.
        wordStr = stripSuffix(wordStr, "[]");
        wordStr = stripSuffix(wordStr, "]");
        wordStr = stripSuffix(wordStr, "[");
        wordStr = stripPrefix(wordStr, "]");
        // Concatenation operators.
        wordStr = stripPrefix(wordStr, "+");
        wordStr = stripSuffix(wordStr, "+");
        // An opening wildcard generic such as "List<?".
        wordStr = stripSuffix(wordStr, "<?");
        // Any double quotes still embedded in the token.
        if(wordStr.contains("\"")){
            wordStr = wordStr.replace("\"", "");
        }
        // Numbers are not words; isNumber also matches the empty string, so a
        // token stripped down to nothing is rejected here as well.
        if(isNumber(wordStr)){
            return null;
        }
        return wordStr;
    }
}
当然,已经可以将每个文件的单词抽取出来了,下面就使用一个类汇总单词信息即可。这里面为了最后结果可以从a-z按照字典排序,使用了TreeMap来统计结果,不多说,见代码:
package net.itaem.luohong.honghong;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * Counts how many times each word occurs.
 *
 * <p>Counts are kept in a {@link TreeMap}, so results are reported in the
 * natural order of {@link Word}.
 *
 * @author luohong 846705189@qq.com
 */
public class WordCount {
    private final Map<Word, Integer> wordCount;

    public WordCount(){
        wordCount = new TreeMap<Word, Integer>();
    }

    /**
     * @return the number of distinct words counted so far
     */
    public int size(){
        return wordCount.size();
    }

    /**
     * Records one more occurrence of a word.
     *
     * @param word the word to count
     * @return the number of occurrences of {@code word} recorded so far,
     *         including this one
     */
    public Integer count(Word word){
        Integer previous = wordCount.get(word);
        int updated = (previous == null) ? 1 : previous + 1;
        // Map.put returns the value it replaced, not the new one, so the
        // updated count has to be returned explicitly.
        wordCount.put(word, updated);
        return updated;
    }

    /**
     * Records one occurrence for every element of a list.
     *
     * @param wordList the words to count; {@code null} is treated as empty
     */
    public void count(List<Word> wordList){
        if(wordList == null){
            return;
        }
        for(Word word: wordList){
            count(word);
        }
    }

    /**
     * Prints every word and its count to standard output as
     * {@code word:count}, one per line, in sorted word order.
     */
    public void printResult(){
        for(Map.Entry<Word, Integer> entry: wordCount.entrySet()){
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
    }
}
<span style="font-size:18px;"><pre name="code" class="java">package net.itaem.luohong.honghong;
import static java.nio.file.FileVisitResult.CONTINUE;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.List;
/**
* 递归遍历目录
*
* */
public class PrintFiles extends SimpleFileVisitor<Path> {
WordCount wordCount = new WordCount();
/**
 * Counts the words of every regular file that is not a symbolic link.
 * The path of each counted file is printed to standard output; symbolic
 * links and other kinds of entries are skipped.
 *
 * @param file the file being visited
 * @param attr the basic attributes of {@code file}
 * @return {@code CONTINUE}, so the walk always proceeds
 */
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attr) {
    boolean countable = !attr.isSymbolicLink() && attr.isRegularFile();
    if (countable) {
        List<Word> extracted = WordReader.wordInFile(file.toString());
        System.out.println(file);
        wordCount.count(extracted);
    }
    return CONTINUE;
}
/**
 * Called after all entries of a directory have been visited; does nothing
 * except let the walk go on.
 *
 * @param dir the directory that was just completed
 * @param exc the error raised while iterating the directory, or
 *            {@code null}; it is not inspected here
 * @return the CONTINUE result, so the walk always proceeds
 */
@Override
public FileVisitResult postVisitDirectory(Path dir, IOException exc) {
    return FileVisitResult.CONTINUE;
}
/**
 * Called when a file cannot be visited. The failure is not reported and
 * the walk goes on with the next entry.
 *
 * <p>Without this override an access error would surface as an
 * {@link IOException} from the walk.
 *
 * @param file the file that could not be visited
 * @param exc  the error that prevented the visit; it is not inspected here
 * @return the CONTINUE result, so the walk always proceeds
 */
@Override
public FileVisitResult visitFileFailed(Path file, IOException exc) {
    return FileVisitResult.CONTINUE;
}
/**
 * Walks the directory tree rooted at {@code path}, counts the words of
 * every regular file in it and prints the totals.
 *
 * <p>Counts accumulate in this instance, so calling the method again adds
 * to the earlier totals. An {@link IOException} from the walk is printed
 * and no result is shown.
 *
 * @param path the directory to start from; when {@code null} a message is
 *             written to standard error and nothing is counted
 */
public void count(String path){
    if (path == null) {
        System.err.println("no directory given, nothing to count");
        return;
    }
    Path startingDir = new File(path).toPath();
    try {
        // Walk with this instance: the totals printed below live in
        // this.wordCount, so a separate visitor would leave them empty.
        Files.walkFileTree(startingDir, this);
        wordCount.printResult();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
</span><span style="font-size:18px;">
/**
 * Asks for a directory on standard input, then counts and prints the
 * words found in the files below it.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    System.out.println("enter the directory you want to count");
    String path = new CommandReader().readCommand();
    if (path == null) {
        // readCommand() yields null on end of input or a read error.
        System.err.println("no directory entered");
        return;
    }
    new PrintFiles().count(path);
}
}</span><span style="font-size:18px;">package net.itaem.luohong.honghong;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
public class CommandReader{
/** Creates a reader; the body is empty. */
public CommandReader(){
}
/**
 * Reads one line from standard input.
 *
 * @return the line without its terminator, or {@code null} at end of
 *         input or when reading fails (the error is printed)
 */
public String readCommand(){
    // Deliberately not closed: closing this reader would close System.in
    // itself, and every later read from standard input in this JVM would
    // fail.
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    try {
        return reader.readLine();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
}</span>
输出部分结果:
<span style="font-size:18px;">addExceptionDetailMessage:2 addFirst:2 addGuardedAction:2 addIDAtPosition:3 addIORComponentToProfileInternal:3 addInvocationHandler:3 addLast:14 addNodes:3 addObject:7 addPOA:2 addPoaInactive:9 addPoaManager:2 addPrefix:2 addRemoteContactInfos:2 addReply:12 addReplyServiceContextQueue:11 addServiceContext:3 addServiceContexts:2 addTaggedProfiles:3 addToErrorSet:7 addToIORTemplate:3 addToParser:2 addToProperties:2 addTypeCodeAtPosition:4 addWork:7 addXXX:1 add_client_request_interceptor:2 add_client_request_interceptor_with_policy:1 add_component:1 add_component_to_profile:1 add_exception:1 add_in_arg:1 add_inout_arg:1 add_ior_component:1 add_ior_component_to_profile:1 add_ior_interceptor:2 add_ior_interceptor_with_policy:1 add_item:4 add_named_in_arg:1 add_named_inout_arg:1 add_named_out_arg:1 add_out_arg:1 add_reply_service_context:2 add_request_service_context:1 add_server_request_interceptor:2 add_server_request_interceptor_with_policy:1 add_value:1 added:31 addedSerialOrExtern:2 adding:15 addition:5 additional:10 addr:29 addrDisp:12 addrDisposition:12 address:30 addresses:2 addressing:17 addressingDisposition:14 adds:9 adheres:1 adjust:3 admap:2 admin:1 advance:2 advancement:1 advancing:1 advertise:1 advertised:1 ae:10 af:1 affect:3 affected:1 affecting:2 affiliates:438 after:125 afterwards:8 again:23 against:10 againt:1 aggregate:1 agree:7 agreed:2 ahead:2 ai:1 aids:2 aka:1 alert:2 algorithm:11 alias:8 aliased:4 aliases:5 aliasing:2 align:50 alignAndCheck:15 alignAndReserve:16 alignIncr:2 alignOnBoundary:12 alignResult:2 aligned:17 alignment:43 alignmentForEncoding:2 alive:3 all:295 all*:1 all?:3 allMethodInfo:11 allMethodNames:3 allMethodsThrowRemoteException:2 allocComponents:7 allocate:15 allocateDirect:1 allocateServerRequestId:4 allocateSlotId:2 allocate_slot_id:4 allocated:13 allocating:3 allocation:3 allocations:2 allocator:1 allow:32 allowIndirection:2 allowLocalOptimization:3 allowed:23 allowing:8 allows:29 almost:3</span>
使用jdk 1.7的新api来统计jdk中英语单词的分布情况
原文:http://blog.csdn.net/u010469003/article/details/44625489