协同过滤源码路径:
~/project/javaproject/mahout-0.9/core/src $tree main/java/org/apache/mahout/cf/taste/ -L 2
main/java/org/apache/mahout/cf/taste/
├── common
│ ├── NoSuchItemException.java
│ ├── NoSuchUserException.java
│ ├── Refreshable.java
│ ├── TasteException.java
│ └── Weighting.java
├── eval
│ ├── DataModelBuilder.java
│ ├── IRStatistics.java
│ ├── RecommenderBuilder.java
│ ├── RecommenderEvaluator.java
│ ├── RecommenderIRStatsEvaluator.java
│ └── RelevantItemsDataSplitter.java
├── hadoop
│ ├── EntityEntityWritable.java
│ ├── EntityPrefWritable.java
│ ├── MutableRecommendedItem.java
│ ├── RecommendedItemsWritable.java
│ ├── TasteHadoopUtils.java
│ ├── ToEntityPrefsMapper.java
│ ├── ToItemPrefsMapper.java
│ ├── TopItemsQueue.java
│ ├── als
│ ├── item
│ ├── preparation
│ └── similarity
├── impl
│ ├── common
│ ├── eval
│ ├── model
│ ├── neighborhood
│ ├── recommender
│ └── similarity
├── model
│ ├── DataModel.java
│ ├── IDMigrator.java
│ ├── JDBCDataModel.java
│ ├── Preference.java
│ ├── PreferenceArray.java
│ └── UpdatableIDMigrator.java
├── neighborhood
│ └── UserNeighborhood.java
├── recommender
│ ├── CandidateItemsStrategy.java
│ ├── IDRescorer.java
│ ├── ItemBasedRecommender.java
│ ├── MostSimilarItemsCandidateItemsStrategy.java
│ ├── RecommendedItem.java
│ ├── Recommender.java
│ ├── Rescorer.java
│ └── UserBasedRecommender.java
└── similarity
  ├── ItemSimilarity.java
  ├── PreferenceInferrer.java
  ├── UserSimilarity.java
  └── precompute
similarity 相似度的interface定义
recommender 推荐算法的interface定义
model 数据model类型的interface定义
impl 目录 则是以上interface定义的实现
PearsonCorrelationSimilarity的实现在
~/mahout-core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
/** * @throws IllegalArgumentException if {@link DataModel} does not have preference values */ public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { //这里CenterData传的时true /* pearson其实做的事情就是先把两个向量都减去他们的平均值,然后再计算cosine值。 * 在 AbstractSimilarity里的实现代码如下: * double result; if (centerData) { double meanX = sumX / count; double meanY = sumY / count; // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY; double centeredSumXY = sumXY - meanY * sumX; // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX; double centeredSumX2 = sumX2 - meanX * sumX; // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY; double centeredSumY2 = sumY2 - meanY * sumY; result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2); } else { result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); } */ super(dataModel, weighting, true); Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn‘t have preference values"); } @Override double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { if (n == 0) { return Double.NaN; } // Note that sum of X and sum of Y don‘t appear here since they are assumed to be 0; // the data is assumed to be centered. double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2); if (denominator == 0.0) { // One or both parties has -all- the same ratings; // can‘t really say much similarity under this measure return Double.NaN; } return sumXY / denominator; }
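就是皮尔逊相关系数数学公式的实现(先把两个向量各自减去均值做中心化,再求余弦):
$$r = \frac{\sum_{i}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i}(x_i-\bar{x})^2}\,\sqrt{\sum_{i}(y_i-\bar{y})^2}}$$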
具体的累加(sumX、sumY、sumXY、sumX2、sumY2 等)在 AbstractSimilarity 的 userSimilarity 里已经做了:
@Override public double userSimilarity(long userID1, long userID2) throws TasteException { DataModel dataModel = getDataModel();
//获取用户偏好 PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1); PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2); int xLength = xPrefs.length(); int yLength = yPrefs.length(); if (xLength == 0 || yLength == 0) { return Double.NaN; } long xIndex = xPrefs.getItemID(0); long yIndex = yPrefs.getItemID(0); int xPrefIndex = 0; int yPrefIndex = 0; double sumX = 0.0; double sumX2 = 0.0; double sumY = 0.0; double sumY2 = 0.0; double sumXY = 0.0; double sumXYdiff2 = 0.0; int count = 0; boolean hasInferrer = inferrer != null; while (true) { int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; if (hasInferrer || compare == 0) { double x; double y; if (xIndex == yIndex) { // Both users expressed a preference for the item x = xPrefs.getValue(xPrefIndex); y = yPrefs.getValue(yPrefIndex); } else { //如果不存在对应的分数,则进行推断... // Only one user expressed a preference, but infer the other one‘s preference and tally // as if the other user expressed that preference if (compare < 0) { // X has a value; infer Y‘s x = xPrefs.getValue(xPrefIndex); y = inferrer.inferPreference(userID2, xIndex); } else { // compare > 0 // Y has a value; infer X‘s x = inferrer.inferPreference(userID1, yIndex); y = yPrefs.getValue(yPrefIndex); } } sumXY += x * y; sumX += x; sumX2 += x * x; sumY += y; sumY2 += y * y; double diff = x - y; sumXYdiff2 += diff * diff; count++; } if (compare <= 0) { if (++xPrefIndex >= xLength) { if (hasInferrer) { // Must count other Ys; pretend next X is far away if (yIndex == Long.MAX_VALUE) { // ... but stop if both are done! break; } xIndex = Long.MAX_VALUE; } else { break; } } else { xIndex = xPrefs.getItemID(xPrefIndex); } } if (compare >= 0) { if (++yPrefIndex >= yLength) { if (hasInferrer) { // Must count other Xs; pretend next Y is far away if (xIndex == Long.MAX_VALUE) { // ... but stop if both are done! break; } yIndex = Long.MAX_VALUE; } else { break; } } else { yIndex = yPrefs.getItemID(yPrefIndex); } } } // "Center" the data. 
If my math is correct, this‘ll do it. double result; if (centerData) { double meanX = sumX / count; double meanY = sumY / count; // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY; double centeredSumXY = sumXY - meanY * sumX; // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX; double centeredSumX2 = sumX2 - meanX * sumX; // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY; double centeredSumY2 = sumY2 - meanY * sumY; result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2); } else { result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); } if (!Double.isNaN(result)) { result = normalizeWeightResult(result, count, cachedNumItems); } return result; }
参考:
http://blog.csdn.net/v_july_v/article/details/7184318
http://blog.sina.com.cn/s/blog_73de143c010153vp.html
Apache mahout 源码阅读笔记--协同过滤, PearsonCorrelationSimilarity
原文:http://www.cnblogs.com/zhangqingping/p/4105401.html