import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Local Spark context and SQLContext (Spark 1.x API).
SparkConf sparkConf = new SparkConf();
sparkConf
    .setAppName("Internal_Func")
    .setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(javaSparkContext);
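// Sample input records in "name,sc" form; "2,111" appears twice to exercise distinct counting.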
List<String> list = new ArrayList<String>();
list.add("1,1");
list.add("2,11");
list.add("2,111");
list.add("2,111");
list.add("3,1111");
list.add("3,11111");
JavaRDD<String> rdd_str = javaSparkContext.parallelize(list, 5);
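// Parse each "name,sc" string into a Row(name: String, sc: long) so a schema can be applied.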
JavaRDD<Row> rdd_row = rdd_str.map(new Function<String, Row>() {
    @Override
    public Row call(String v1) throws Exception {
        String[] ary = v1.split(",");
        return RowFactory.create(ary[0], Long.parseLong(ary[1]));
    }
});
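// Schema matching the Row fields: name (string) and sc (long), both nullable.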
List<StructField> fieldList = new ArrayList<StructField>();
fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
fieldList.add(DataTypes.createStructField("sc", DataTypes.LongType, true));
StructType tmp = DataTypes.createStructType(fieldList);
DataFrame df = sqlContext.createDataFrame(rdd_row, tmp);
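// Register the DataFrame as a temporary table so it can be queried with Spark SQL.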
df.registerTempTable("tmp_sc");
DataFrame df_agg = sqlContext.sql("select name, count(distinct sc) as distinct_sc from tmp_sc group by name"); // count distinct sc values per name group (deduplicate, then count)
df_agg.show();
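
// With this data, the expected distinct counts are: name 1 -> 1, name 2 -> 2
// (the duplicate "2,111" is counted only once), name 3 -> 2; show() row order may vary.

// Equivalent aggregation via the DataFrame API instead of a SQL string: a minimal
// sketch, assuming the same df as above and Spark 1.x's
// org.apache.spark.sql.functions.countDistinct.
DataFrame df_agg2 = df.groupBy("name")
    .agg(org.apache.spark.sql.functions.countDistinct("sc").as("distinct_sc"));
df_agg2.show();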