【Spark 数据框】Spark 数据框 DataFrame

%spark
/*
 * Build a DataFrame from an RDD of (name, age) tuples: distribute a local
 * collection through the SparkContext `sc`, then label the two tuple fields
 * "name" and "age" with toDF.
 */
val data = sc.parallelize(Seq("Alice" -> 21, "Bob" -> 24))
val people = data.toDF("name", "age")
people.show()

/*
 * Create a two-column DataFrame from a local range: for each n in 1..5 the
 * row is (n, n * 2), and the columns are named "single" and "double".
 */
val df1 = (for (n <- 1 to 5) yield (n, n * 2)).toDF("single", "double")
df1.show()

/* Filter rows: keep only the rows of df1 whose "double" value is greater than 6. */
val df2 = df1.where(df1("double") > 6)
df2.show()

/*
 * Concatenate DataFrames: the rows of df2 are appended to the rows of df1.
 * Duplicates are kept, so rows present in both inputs appear twice.
 */
val df3 = df1 unionAll df2
df3.show()

/*
 * Join without a join condition: every row of df1 is paired with every row
 * of df2 (a Cartesian product), and the result carries the columns of both.
 */
val df4 = df1 join df2
df4.show()

/*
 * Inner join on equality of the "single" column: only rows whose "single"
 * value occurs in both df1 and df2 are kept.
 */
val df5 = {
  // Join predicate, named locally so it does not become a notebook-level val.
  val sameSingle = df1("single") === df2("single")
  df1.join(df2, sameSingle, "inner")
}
df5.show()


/*
 * Rename the columns: toDF assigns the new names by position, which also
 * gives distinct names to the duplicated "single"/"double" columns of df5.
 */
val df6 = df5.toDF(Seq("V1", "V2", "V3", "V4"): _*)
df6.show()



data: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[23208] at parallelize at <console>:27
people: org.apache.spark.sql.DataFrame = [name: string, age: int]
+-----+---+
| name|age|
+-----+---+
|Alice| 21|
|  Bob| 24|
+-----+---+

df1: org.apache.spark.sql.DataFrame = [single: int, double: int]
+------+------+
|single|double|
+------+------+
|     1|     2|
|     2|     4|
|     3|     6|
|     4|     8|
|     5|    10|
+------+------+

df2: org.apache.spark.sql.DataFrame = [single: int, double: int]
+------+------+
|single|double|
+------+------+
|     4|     8|
|     5|    10|
+------+------+

df3: org.apache.spark.sql.DataFrame = [single: int, double: int]
+------+------+
|single|double|
+------+------+
|     1|     2|
|     2|     4|
|     3|     6|
|     4|     8|
|     5|    10|
|     4|     8|
|     5|    10|
+------+------+

df4: org.apache.spark.sql.DataFrame = [single: int, double: int, single: int, double: int]
+------+------+------+------+
|single|double|single|double|
+------+------+------+------+
|     1|     2|     4|     8|
|     1|     2|     5|    10|
|     2|     4|     4|     8|
|     2|     4|     5|    10|
|     3|     6|     4|     8|
|     3|     6|     5|    10|
|     4|     8|     4|     8|
|     4|     8|     5|    10|
|     5|    10|     4|     8|
|     5|    10|     5|    10|
+------+------+------+------+

df5: org.apache.spark.sql.DataFrame = [single: int, double: int, single: int, double: int]
+------+------+------+------+
|single|double|single|double|
+------+------+------+------+
|     4|     8|     4|     8|
|     5|    10|     5|    10|
+------+------+------+------+

df6: org.apache.spark.sql.DataFrame = [V1: int, V2: int, V3: int, V4: int]
+---+---+---+---+
| V1| V2| V3| V4|
+---+---+---+---+
|  4|  8|  4|  8|
|  5| 10|  5| 10|
+---+---+---+---+
已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页