[root@centos00 ~]$ cd hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
[root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala> val arr = Array(("a", "20"), ("a", "30"), ("b", "20"), ("a", "20"))
arr: Array[(String, String)] = Array((a,20), (a,30), (b,20), (a,20))
scala> val df = sc.parallelize(arr).toDF("id", "age")
df: org.apache.spark.sql.DataFrame = [id: string, age: string]
scala> df.show(false)
+---+---+
|id |age|
+---+---+
|a  |20 |
|a  |30 |
|b  |20 |
|a  |20 |
+---+---+
scala> df.groupBy('id).agg(countDistinct('age) as 'distinctAge).show(false)
+---+-----------+
|id |distinctAge|
+---+-----------+
|b  |1          |
|a  |2          |
+---+-----------+
scala> df.groupBy("id").agg(countDistinct("age") as "distinctAge").show(false)
+---+-----------+
|id |distinctAge|
+---+-----------+
|b  |1          |
|a  |2          |
+---+-----------+
原文:https://www.cnblogs.com/ji-hf/p/13665911.html