import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;

JavaSparkContext jsc = ...

JavaRDD<Vector> mat = ... // an RDD of Vectors

// Compute column summary statistics.
MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
System.out.println(summary.mean()); // a dense vector containing the mean value for each column
System.out.println(summary.variance()); // column-wise variance
System.out.println(summary.numNonzeros()); // number of nonzeros in each column
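As a minimal sketch of how the placeholders might be filled in (the local master, app name, and sample rows below are illustrative assumptions, not part of the original snippet):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;

// Illustrative setup: a local context and three made-up dense rows.
JavaSparkContext jsc = new JavaSparkContext(
    new SparkConf().setAppName("ColStatsSketch").setMaster("local[*]"));
JavaRDD<Vector> mat = jsc.parallelize(Arrays.asList(
    Vectors.dense(1.0, 10.0, 100.0),
    Vectors.dense(2.0, 20.0, 200.0),
    Vectors.dense(3.0, 30.0, 300.0)));

MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
System.out.println(summary.mean());     // expected: [2.0,20.0,200.0]
System.out.println(summary.variance()); // expected: [1.0,100.0,10000.0]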
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.stat.Statistics;

JavaSparkContext jsc = ...

JavaDoubleRDD seriesX = ... // a series
JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX

// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");

JavaRDD<Vector> data = ... // note that each Vector is a row and not a column

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
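A sketch of concrete inputs, reusing an existing JavaSparkContext jsc; the series values are made up, and java.util.Arrays plus org.apache.spark.mllib.linalg.Vectors are additionally assumed as imports:

// Two short, correlated series (illustrative values only).
JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));
JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));
System.out.println(Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"));

// Each Vector is one observation (row), one column per variable.
JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(
    Vectors.dense(1.0, 10.0, 100.0),
    Vectors.dense(2.0, 20.0, 200.0),
    Vectors.dense(5.0, 33.0, 366.0)));
System.out.println(Statistics.corr(data.rdd(), "pearson")); // a 3x3 correlation matrix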
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

JavaSparkContext jsc = ...

JavaPairRDD<K, V> data = ... // an RDD of any key-value pairs
Map<K, Object> fractions = ... // specify the exact fraction desired from each key (values are Double)

// Get an approximate sample from each stratum
JavaPairRDD<K, V> approxSample = data.sampleByKey(false, fractions);
// Get an exact sample from each stratum
JavaPairRDD<K, V> exactSample = data.sampleByKeyExact(false, fractions);
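A hedged sketch with concrete strata, reusing an existing jsc; the keys, pairs, and fractions are made up, and java.util.HashMap, java.util.Arrays, and scala.Tuple2 are additionally assumed as imports. Note the erased Java signature takes Map<K, Object>, even though the values are Doubles:

// Six illustrative (key, value) pairs across three strata.
JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'a'), new Tuple2<>(1, 'b'), new Tuple2<>(2, 'c'),
    new Tuple2<>(2, 'd'), new Tuple2<>(2, 'e'), new Tuple2<>(3, 'f')));

// Desired sampling fraction per key (Double values, boxed as Object).
Map<Integer, Object> fractions = new HashMap<>();
fractions.put(1, 0.1);
fractions.put(2, 0.6);
fractions.put(3, 0.3);

JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);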
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;

JavaSparkContext jsc = ...

Vector vec = ... // a vector composed of the frequencies of events

// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
// the test runs against a uniform distribution.
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
// and the null hypothesis.
System.out.println(goodnessOfFitTestResult);

Matrix mat = ... // a contingency matrix

// conduct Pearson's independence test on the input contingency matrix
ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
// summary of the test including the p-value, degrees of freedom...
System.out.println(independenceTestResult);

JavaRDD<LabeledPoint> obs = ... // an RDD of labeled points

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());

int i = 1;
for (ChiSqTestResult result : featureTestResults) {
  System.out.println("Column " + i + ":");
  System.out.println(result); // summary of the test
  i++;
}
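A sketch of made-up inputs for each of the three test variants, reusing an existing jsc; java.util.Arrays is additionally assumed as an import, while Vectors, Matrices, and LabeledPoint come from the packages already imported above:

// Goodness of fit: observed event frequencies (illustrative values).
Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
System.out.println(Statistics.chiSqTest(vec));

// Independence: a 3x2 contingency matrix given in column-major order.
Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
System.out.println(Statistics.chiSqTest(mat));

// Feature independence: three made-up labeled points.
JavaRDD<LabeledPoint> obs = jsc.parallelize(Arrays.asList(
    new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
    new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
    new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))));
ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());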
import java.util.Arrays;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;

JavaSparkContext jsc = ...

JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
// summary of the test including the p-value, test statistic, and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis
System.out.println(testResult);
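Beyond the printed summary, the result object exposes individual accessors. A sketch with a concrete, made-up sample in place of the elided list:

JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
KolmogorovSmirnovTestResult testResult =
    Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
System.out.println(testResult.pValue());    // just the p-value
System.out.println(testResult.statistic()); // just the KS statistic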
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import static org.apache.spark.mllib.random.RandomRDDs.*;

JavaSparkContext jsc = ...

// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
// Apply a transform to get a random double RDD following `N(1, 4)`.
// (mapToDouble keeps the result a JavaDoubleRDD; plain map would return JavaRDD<Double>.)
JavaDoubleRDD v = u.mapToDouble(
  new DoubleFunction<Double>() {
    public double call(Double x) {
      return 1.0 + 2.0 * x;
    }
  });
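As a quick sanity check (a sketch, assuming the snippet above has run), the sample moments of the transformed RDD should approach those of the target distribution:

// mean() and variance() are built into JavaDoubleRDD; with 1 million samples these
// should land close to 1.0 and 4.0 respectively, since Var(1 + 2X) = 4 Var(X).
System.out.println(v.mean());
System.out.println(v.variance());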
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.rdd.RDD;

RDD<Double> data = ... // an RDD of sample data

// Construct the density estimator with the sample data and a standard deviation for the Gaussian
// kernels
KernelDensity kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0);

// Find density estimates for the given values
double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
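A sketch with concrete inputs, assuming an existing JavaSparkContext jsc plus java.util.Arrays and org.apache.spark.api.java.JavaRDD as imports; setSample also accepts a JavaRDD<Double>, which is easier to build from Java than a Scala RDD (the sample values are made up):

// Illustrative one-dimensional sample.
JavaRDD<Double> sample = jsc.parallelize(Arrays.asList(
    1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

KernelDensity kd = new KernelDensity()
  .setSample(sample)
  .setBandwidth(3.0);

// Estimated density at x = -1.0, 2.0, and 5.0.
double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
System.out.println(Arrays.toString(densities));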
Source: http://www.cnblogs.com/yuguoshuo/p/6265711.html