写脚本生成类似文件
java 代码
封装类
package day0327;
import java.util.UUID;
/**
 * Mutable bean holding the seven fields of one generated access-log record.
 *
 * <p>Every field has a plain getter/setter pair and no validation is done.
 * The identifier is kept as a {@code String}; an earlier variant of this class
 * typed it as {@code java.util.UUID}.
 */
public class data {

    /** Client IP address text. */
    private String ip;

    /** Region / address label. */
    private String address;

    /** Date rendered as text. */
    private String date;

    /** Numeric timestamp of the record. */
    private long timestamp;

    /** Identifier of the visitor, stored as text. */
    private String uuid;

    /** "Port" column of the record, stored as text. */
    private String port;

    /** Name of the action that was performed. */
    private String method;

    /** @return the IP address text, or {@code null} if never set */
    public String getIp() {
        return this.ip;
    }

    /** @param newIp IP address text to store */
    public void setIp(String newIp) {
        this.ip = newIp;
    }

    /** @return the address label, or {@code null} if never set */
    public String getAddress() {
        return this.address;
    }

    /** @param newAddress address label to store */
    public void setAddress(String newAddress) {
        this.address = newAddress;
    }

    /** @return the date text, or {@code null} if never set */
    public String getDate() {
        return this.date;
    }

    /** @param newDate date text to store */
    public void setDate(String newDate) {
        this.date = newDate;
    }

    /** @return the timestamp, {@code 0} if never set */
    public long getTimestamp() {
        return this.timestamp;
    }

    /** @param newTimestamp timestamp to store */
    public void setTimestamp(long newTimestamp) {
        this.timestamp = newTimestamp;
    }

    /** @return the visitor identifier, or {@code null} if never set */
    public String getUuid() {
        return this.uuid;
    }

    /** @param newUuid visitor identifier to store */
    public void setUuid(String newUuid) {
        this.uuid = newUuid;
    }

    /** @return the port text, or {@code null} if never set */
    public String getPort() {
        return this.port;
    }

    /** @param newPort port text to store */
    public void setPort(String newPort) {
        this.port = newPort;
    }

    /** @return the action name, or {@code null} if never set */
    public String getMethod() {
        return this.method;
    }

    /** @param newMethod action name to store */
    public void setMethod(String newMethod) {
        this.method = newMethod;
    }
}
主函数
package day0327;
import entity.Costinfo;
import java.io.*;
import java.sql.Array;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;
/**
 * Generates a tab-separated file of random access-log records.
 *
 * <p>Each line has seven columns: ip, address, date ({@code yyyy-MM-dd}),
 * timestamp in milliseconds, user id, site and action, terminated by
 * {@code \n}.
 */
public class Main {

    /** Number of records produced by one run of {@link #main(String[])}. */
    private static final int RECORD_COUNT = 10000;

    /** File that {@link #write(List)} creates or overwrites. */
    private static final String OUTPUT_PATH = "E:/newdata.txt";

    /**
     * Region names a record can be tagged with.
     * The original list contained "山西" twice; the second occurrence (between
     * 云南 and 甘肃) is corrected to "陕西" so no region is drawn twice as often.
     */
    private static final String[] ADDRESSES = {
        "北京", "天津", "上海", "重庆", "河北", "辽宁", "山西", "吉林", "江苏",
        "浙江", "黑龙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南",
        "广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海"
    };

    /** Site host names stored in the record's "port" field. */
    private static final String[] SITES = {
        "www.baidu.com", "www.taobao.com", "www.dangdang.com", "www.jd.com",
        "www.suning.com", "www.mi.com", "www.gome.com.cn"
    };

    /** Action names stored in the record's "method" field. */
    private static final String[] METHODS = {
        "Regist", "Comment", "View", "Login", "Buy", "Click", "Logout"
    };

    /**
     * Builds {@value #RECORD_COUNT} random records and writes them to the
     * output file.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Created once: both are loop-invariant and were rebuilt per record before.
        Random random = new Random();
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

        List<data> datas = new ArrayList<>(RECORD_COUNT);
        for (int i = 0; i < RECORD_COUNT; i++) {
            datas.add(randomRecord(random, dateFormat));
        }
        write(datas);
    }

    /** Creates one record with every field filled from {@code random} and the current time. */
    private static data randomRecord(Random random, SimpleDateFormat dateFormat) {
        data record = new data();

        // Four independent octets in 0..255 joined as a dotted quad.
        String ip = random.nextInt(256) + "." + random.nextInt(256) + "."
                + random.nextInt(256) + "." + random.nextInt(256);
        record.setIp(ip);
        record.setAddress(pick(random, ADDRESSES));

        // Date text and millisecond timestamp come from the same instant.
        Date now = new Date();
        record.setDate(dateFormat.format(now));
        record.setTimestamp(now.getTime());

        // User id is the prefix "10" followed by a number in 0..999 (not zero-padded).
        record.setUuid("10" + random.nextInt(1000));
        record.setPort(pick(random, SITES));
        record.setMethod(pick(random, METHODS));
        return record;
    }

    /** Returns a uniformly chosen element of {@code options}. */
    private static String pick(Random random, String[] options) {
        return options[random.nextInt(options.length)];
    }

    /**
     * Writes the records to {@code E:/newdata.txt} as UTF-8, one tab-separated
     * line per record, replacing any existing file.
     *
     * <p>I/O failures are reported with a stack trace and otherwise ignored,
     * so the caller is not interrupted.
     *
     * @param datas records to write, in output order
     */
    public static void write(List<data> datas) {
        File file = new File(OUTPUT_PATH);
        // try-with-resources closes all three streams, in reverse order, on every path.
        try (FileOutputStream fos = new FileOutputStream(file);
             OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
             BufferedWriter bw = new BufferedWriter(osw)) {
            // Stream line by line instead of building the whole file in memory first.
            for (data c : datas) {
                bw.write(c.getIp() + "\t" + c.getAddress() + "\t" + c.getDate() + "\t"
                        + c.getTimestamp() + "\t" + c.getUuid() + "\t" + c.getPort() + "\t"
                        + c.getMethod() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
需要完成任务
1.pv pageview 得出每个网站总的访问量
2.uv unique visitor 得出每个网站不同ip的访问量
3.每个网址 访问量top3地区 和对应的人数
www.baidu.com 北京 2000
河北 1800
辽宁 1000
www.taobao.com....
java测试spark环境所需pom文件,以及建立工程项目参考
/**
 * Spark job that analyses the tab-separated access log at E:/newdata.txt.
 *
 * Columns used (0-based): 0 = ip, 1 = region, 5 = site.
 * Three tasks are covered; tasks 1 and 2 are kept as commented-out snippets
 * and task 3 is the one that runs.
 *
 * SparkConf and SparkContext are expected to be imported from
 * org.apache.spark at the top of the file.
 */
object test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // Set the application name
    conf.setAppName("word_count")
    // Set the mode Spark runs in
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    // try/finally so the context is stopped even when the job throws.
    try {
      val rdd1 = sc.textFile("E:/newdata.txt")

      // Task 1 (PV): total number of visits per site
      /*val rdd2=rdd1.map(line=>{
      (line.split("\t")(5),1)
      }).reduceByKey(_+_).sortBy(_._2,true).foreach(println)*/
      /* Explanation: after the file is read into an RDD, map splits each line
         on \t, takes the sixth field (the site) and pairs it with a count of 1.
         The counts are then summed per key, the result is sorted by value
         (the computed total) and printed. */

      // Task 2 (UV): number of distinct ips per site
      /*val rdd2 = rdd1.map(line => {
      (line.split("\t")(5), line.split("\t")(0))
      }).distinct().countByKey().foreach(println)*/
      /* Explanation: map produces (String, String) pairs of (site, ip);
         distinct removes repeated pairs, i.e. repeated ips for a site; the
         pairs are then counted per key (the site) and printed.

         Because of the incompleteness of the data, the result does not differ
         from task 1. To see a difference, change two records in the input so
         that they have the same ip and site; the expected result is then that
         the count for www.mi.com is one lower. */

      // Task 3: for every site, the top 3 regions and their visitor counts
      rdd1.map(line => {
        // Split once per line instead of once per extracted field.
        val fields = line.split("\t")
        ((fields(5), fields(1)), fields(0))
      }).distinct().countByKey().groupBy(_._1._1).map(one => {
        // Highest count first, so the output order matches the expected
        // listing (largest region first); the previous ascending sort with
        // takeRight(3) printed the same three entries smallest-first.
        one._2.toSeq.sortBy(-_._2).take(3)
      }).foreach(println)
      /* Explanation: similar to the two tasks above, except that map yields
         ((String, String), String), i.e. ((site, region), ip). After distinct
         the pairs are counted per (site, region) key and grouped by the first
         element of the key (the site). Each group's value holds the complete
         ((site, region), count) entries, so map takes that second part and
         converts it with toSeq (or toList); otherwise sortBy is not available.
         The entries are sorted by descending count and the first three are
         taken (take keeps the first n, drop removes the first n). */
    } finally {
      sc.stop()
    }
  }
}
原文:https://www.cnblogs.com/pursue339/p/10611056.html