一、词频对象 WordWrap
public class WordWrap implements Comparable { public String word; public int count; public WordWrap(String s, int count) { this.word = s; this.count = count; } @Override public int compareTo(Object o) { WordWrap oo = (WordWrap) o; if (this.count > oo.count) return 1; if (this.count < oo.count) return -1; return this.word.compareTo(oo.word); } @Override public String toString() { return "WordWrap{" + "word=‘" + word + ‘\‘‘ + ", count=" + count + ‘}‘; } }
二、小顶堆和测试方法 SmallTopHeap
/** * 小顶堆. * 求 TOP K问题 * * @author james.ding */ public class SmallTopHeap<E> { private Object[] queue; private int capacity; private int count; public static void main(String[] args) { String a = "We are driven to create the most powerful platform for developing time series applications – one that excites and propels developers to be wildly successful.\n" + "\n" + "The explosion in the amount of data that is being created is expected to grow nearly fivefold by 2025 to 175 zettabytes per year, primarily from applications, networks, containers, and a projected 60 billion IoT devices. While there is a tremendous opportunity for organizations to leverage this data to enhance customer experiences, improve employee and process productivity, and to create competitive advantage, it will be challenging to store and analyze the large volumes and high-frequency streams of data.\n" + "\n" + "We created InfluxDB, a purpose-built time series platform, to specifically handle these new workloads, encompassing millions of data points and hundreds of data sources because general-purpose databases are inadequate. And to derive intelligence faster, we created Flux, a powerful scripting and query engine designed specifically to transform time series data into information and enable real-time decisions.\n" + "\n" + "We are dedicated a a a a a a a a a a to open source and truly care about helping developers get to results faster with less complexity and less code. We help customers like Cisco, IBM, PayPal, and Tesla build transformative IoT, analytics and monitoring applications quicker and to scale, delivering new insights from their data.\n" + "\n" + "InfluxData is backed by Norwest Venture Partners, Sapphire Ventures, Battery Ventures, Trinity Ventures, Mayfield, Harmony Partners, Sorenson Capital, Bloomberg Beta and Y Combinator. We are headquartered in San Francisco, with offices in Austin and team members distributed throughout the U.S. and across Europe."; //构建散列表 String[] words = a.split(" "); Map<String, Integer> map = new HashMap(); for (String s : words) map.put(s, map.getOrDefault(s, 0) + 1); //出现频率最高的单词 top 5 SmallTopHeap<WordWrap> smallTopHeap = new SmallTopHeap<>(5); //遍历散列表往小顶堆中插入数据 for (String key : map.keySet()) smallTopHeap.insert(new WordWrap(key, map.get(key))); System.out.println(smallTopHeap); } private final Comparator<? super E> comparator; public SmallTopHeap(int capacity) { queue = new Object[capacity + 1];//从第1个开始存储数据 this.capacity = capacity; this.comparator = null; this.count = 0; } public SmallTopHeap(int capacity, Comparator<? super E> comparator) { queue = new Object[capacity + 1];//从第1个开始存储数据 this.capacity = capacity; this.comparator = comparator; this.count = 0; } /** * 插入元素 */ public void insert(E e) { if (e == null) throw new NullPointerException(); int i; if (count >= capacity) { //小顶堆 //如果新元素大于等于堆顶元素。 则删除堆顶元素。 再插入新元素。 if (comparator == null) { Comparable<? super E> top = (Comparable<? super E>) queue[1]; Comparable<? super E> data = (Comparable<? super E>) e; if (data.compareTo((E) top) >= 0) { removeTop(); insert(e); } } else { if (comparator.compare((E) e, (E) queue[1]) >= 0) { removeTop(); insert(e); } } return; } else { ++count; queue[count] = e; i = count; } //自下往上堆化 heapifySiftUp(i); } private void heapifySiftUp(int i) { if (comparator == null) { siftUpComparable(i); } else { siftUpUsingComparator(i); } } private void siftUpUsingComparator(int i) { while (i / 2 >= 0 && queue[i / 2] != null && comparator.compare((E) queue[i], (E) queue[i / 2]) < 0) { swap(queue, i, i / 2); i = i / 2; } } private void siftUpComparable(int i) { Comparable<? super E> key = (Comparable<? super E>) queue[i]; Comparable<? super E> keyParent = (Comparable<? super E>) queue[i / 2]; while (i / 2 >= 0 && (Comparable<? super E>) queue[i / 2] != null && key.compareTo((E) keyParent) < 0) { swap(queue, i, i / 2); i = i / 2; key = (Comparable<? super E>) queue[i]; keyParent = (Comparable<? super E>) queue[i / 2]; } } /** * 删除堆顶元素 */ public void removeTop() { if (count == 0) return; //删除堆顶元素,并将最后一个元素移到堆顶。此时堆的性质已经被破坏,接下来需要进行堆化。 queue[1] = queue[count]; --count; //自上往下堆化 heapifySiftDown(queue, 1); } private void heapifySiftDown(Object[] que, int i) { //自上往下堆化 if (comparator == null) { siftDownComparable(que, i); } else { siftDownUsingComparator(que, i); } } private void siftDownComparable(Object[] que, int i) { while (true) { int maxPos = i; if (i * 2 < capacity) { Comparable<? super E> key = (Comparable<? super E>) que[i]; Comparable<? super E> keyChildLeft = (Comparable<? super E>) que[i * 2]; if (key.compareTo((E) keyChildLeft) > 0) { maxPos = i * 2; } } if (i * 2 + 1 < capacity) { Comparable<? super E> key = (Comparable<? super E>) que[maxPos]; Comparable<? super E> keyChildRight = (Comparable<? super E>) que[i * 2 + 1]; if (key.compareTo((E) keyChildRight) > 0) { maxPos = i * 2 + 1; } } if (maxPos == i) { break; } swap(que, maxPos, i); i = maxPos; } } private void siftDownUsingComparator(Object[] que, int i) { while (true) { int maxPos = i; if (i * 2 < capacity && comparator.compare((E) que[i], (E) que[i * 2]) > 0) { maxPos = i * 2; } if (i * 2 + 1 < capacity && comparator.compare((E) que[maxPos], (E) que[i * 2 + 1]) > 0) { maxPos = i * 2 + 1; } if (maxPos == i) { break; } swap(que, maxPos, i); i = maxPos; } } @Override public String toString() { String result = ""; if (queue != null && count > 0) { int i = 0; for (Object e : queue) { String s = " queue[" + i++ + "] " + e; result += s + "\n"; } } return "HeapV{" + "a=\n" + result + ", n=" + capacity + ", count=" + count + ", comparator=" + comparator + ‘}‘; } }
三、执行SmallTopHeap中的main方法,得出文本中的词频最高的top 5
Heap{a= queue[0] null queue[1] WordWrap{word=‘the‘, count=4} queue[2] WordWrap{word=‘data‘, count=5} queue[3] WordWrap{word=‘a‘, count=14} queue[4] WordWrap{word=‘to‘, count=14} queue[5] WordWrap{word=‘and‘, count=17} , n=5, count=5, comparator=null}