1.Cosine similarity
package org.apache.commons.text.similarity;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* Measures the Cosine similarity of two vectors of an inner product space and
* compares the angle between them.
*
* <p>
* For further explanation about the Cosine Similarity, refer to
* http://en.wikipedia.org/wiki/Cosine_similarity.
* </p>
*
* @since 1.0
*/
public class CosineSimilarity {
/**
* Calculates the cosine similarity for two given vectors.
*
* @param leftVector left vector
* @param rightVector right vector
* @return cosine similarity between the two vectors
*/
public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector) {
if (leftVector == null || rightVector == null) {
throw new IllegalArgumentException("Vectors must not be null");
}
final Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
final double dotProduct = dot(leftVector, rightVector, intersection);
double d1 = 0.0d;
for (final Integer value : leftVector.values()) {
d1 += Math.pow(value, 2);
}
double d2 = 0.0d;
for (final Integer value : rightVector.values()) {
d2 += Math.pow(value, 2);
}
double cosineSimilarity;
if (d1 <= 0.0 || d2 <= 0.0) {
cosineSimilarity = 0.0;
} else {
cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
}
return cosineSimilarity;
}
/**
* Returns a set with strings common to the two given maps.
*
* @param leftVector left vector map
* @param rightVector right vector map
* @return common strings
*/
private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector,
final Map<CharSequence, Integer> rightVector) {
final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet());
intersection.retainAll(rightVector.keySet());
return intersection;
}
/**
* Computes the dot product of two vectors. It ignores remaining elements. It means
* that if a vector is longer than other, then a smaller part of it will be used to compute
* the dot product.
*
* @param leftVector left vector
* @param rightVector right vector
* @param intersection common elements
* @return the dot product
*/
private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector,
final Set<CharSequence> intersection) {
long dotProduct = 0;
for (final CharSequence key : intersection) {
dotProduct += leftVector.get(key) * rightVector.get(key);
}
return dotProduct;
}
}
2.JaccardSimilarity
package org.apache.commons.text.similarity;
import java.util.HashSet;
import java.util.Set;
/**
* Measures the Jaccard similarity (aka Jaccard index) of two sets of character
* sequence. Jaccard similarity is the size of the intersection divided by the
* size of the union of the two sets.
*
* <p>
* For further explanation about Jaccard Similarity, refer
* https://en.wikipedia.org/wiki/Jaccard_index
* </p>
*
* @since 1.0
*/
public class JaccardSimilarity implements SimilarityScore<Double> {
/**
* Calculates Jaccard Similarity of two set character sequence passed as
* input.
*
* @param left first character sequence
* @param right second character sequence
* @return index
* @throws IllegalArgumentException
* if either String input {@code null}
*/
@Override
public Double apply(CharSequence left, CharSequence right) {
if (left == null || right == null) {
throw new IllegalArgumentException("Input cannot be null");
}
return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
}
/**
* Calculates Jaccard Similarity of two character sequences passed as
* input. Does the calculation by identifying the union (characters in at
* least one of the two sets) of the two sets and intersection (characters
* which are present in set one which are present in set two)
*
* @param left first character sequence
* @param right second character sequence
* @return index
*/
private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
Set<String> intersectionSet = new HashSet<String>();
Set<String> unionSet = new HashSet<String>();
boolean unionFilled = false;
int leftLength = left.length();
int rightLength = right.length();
if (leftLength == 0 || rightLength == 0) {
return 0d;
}
for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
unionSet.add(String.valueOf(left.charAt(leftIndex)));
for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
if (!unionFilled) {
unionSet.add(String.valueOf(right.charAt(rightIndex)));
}
if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
}
}
unionFilled = true;
}
return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
}
}
3.LevenshteinDistance
/**
* LevenshteinDistance
* copied from https://commons.apache.org/proper/commons-lang/javadocs/api-2.5/src-html/org/apache/commons/lang/StringUtils.html#line.6162
*/
public static int getLevenshteinDistance(String s, String t) {
if (s == null || t == null) {
throw new IllegalArgumentException("Strings must not be null");
}
int n = s.length(); // length of s
int m = t.length(); // length of t
if (n == 0) {
return m;
} else if (m == 0) {
return n;
}
if (n > m) {
// swap the input strings to consume less memory
String tmp = s;
s = t;
t = tmp;
n = m;
m = t.length();
}
int p[] = new int[n + 1]; //‘previous‘ cost array, horizontally
int d[] = new int[n + 1]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d
// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t
char t_j; // jth character of t
int cost; // cost
for (i = 0; i <= n; i++) {
p[i] = i;
}
for (j = 1; j <= m; j++) {
t_j = t.charAt(j - 1);
d[0] = j;
for (i = 1; i <= n; i++) {
cost = s.charAt(i - 1) == t_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
}
// copy current distance counts to ‘previous row‘ distance counts
_d = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return p[n];
}
4.JaroWinklerDistance
/**
* JaroWinklerDistance
* Copied from https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWinklerDistance.java.html
* apply method changed to similarity
*/
public static Double similarity(final CharSequence left, final CharSequence right) {
final double defaultScalingFactor = 0.1;
final double percentageRoundValue = 100.0;
if (left == null || right == null) {
throw new IllegalArgumentException("Strings must not be null");
}
int[] mtp = matches(left, right);
double m = mtp[0];
if (m == 0) {
return 0D;
}
double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
return Math.round(jw * percentageRoundValue) / percentageRoundValue;
}
protected static int[] matches(final CharSequence first, final CharSequence second) {
CharSequence max, min;
if (first.length() > second.length()) {
max = first;
min = second;
} else {
max = second;
min = first;
}
int range = Math.max(max.length() / 2 - 1, 0);
int[] matchIndexes = new int[min.length()];
Arrays.fill(matchIndexes, -1);
boolean[] matchFlags = new boolean[max.length()];
int matches = 0;
for (int mi = 0; mi < min.length(); mi++) {
char c1 = min.charAt(mi);
for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
matchIndexes[mi] = xi;
matchFlags[xi] = true;
matches++;
break;
}
}
}
char[] ms1 = new char[matches];
char[] ms2 = new char[matches];
for (int i = 0, si = 0; i < min.length(); i++) {
if (matchIndexes[i] != -1) {
ms1[si] = min.charAt(i);
si++;
}
}
for (int i = 0, si = 0; i < max.length(); i++) {
if (matchFlags[i]) {
ms2[si] = max.charAt(i);
si++;
}
}
int transpositions = 0;
for (int mi = 0; mi < ms1.length; mi++) {
if (ms1[mi] != ms2[mi]) {
transpositions++;
}
}
int prefix = 0;
for (int mi = 0; mi < min.length(); mi++) {
if (first.charAt(mi) == second.charAt(mi)) {
prefix++;
} else {
break;
}
}
return new int[] { matches, transpositions / 2, prefix, max.length() };
}
参考资料:
【1】https://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
【2】https://zatackcoder.com/java-program-to-check-two-strings-similarity/
原文:https://blog.51cto.com/15015181/2556388