去除null、NaN:
// Build the Spark configuration. No master is set here, so it has to be
// supplied from outside (e.g. by spark-submit); when packaging the job to run
// on a cluster, any hard-coded master setting must be removed.
// The application name is the simple class name with every '$' stripped
// (Scala objects compile to classes whose names end in '$').
val sparkConf: SparkConf = new SparkConf()
  .setAppName(this.getClass.getSimpleName.filter(_ != '$'))

// Obtain the SparkContext.
val sparkContext = new SparkContext(sparkConf)

// Set the log level so that only warnings and above are logged.
sparkContext.setLogLevel("WARN")

// Obtain the SQLContext.
// NOTE(review): constructing SQLContext directly is deprecated since Spark 2.0
// in favour of SparkSession.builder; kept so the declared type of `spark`
// stays unchanged.
val spark: SQLContext = new SQLContext(sparkContext)

// Read the source table over JDBC.
// NOTE(review): connection credentials are inline literals — consider loading
// them from configuration instead of source code.
val data: DataFrame = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://10.213.111.XXX:23306/buf_amr_all")
  .option("dbtable", "pdwqy_pms_yx_sbxx")
  .option("user", "qjjc")
  .option("password", "XXX")
  .load()

// Drop every row that contains a null or NaN value in any column, then print
// the leading rows of the result to stdout.
data.na.drop().show()
去掉空字符串:
// Build the Spark configuration. No master is set here, so it has to be
// supplied from outside (e.g. by spark-submit); when packaging the job to run
// on a cluster, any hard-coded master setting must be removed.
// The application name is the simple class name with every '$' stripped
// (Scala objects compile to classes whose names end in '$').
val sparkConf: SparkConf = new SparkConf()
  .setAppName(this.getClass.getSimpleName.filter(_ != '$'))

// Obtain the SparkContext.
val sparkContext = new SparkContext(sparkConf)

// Set the log level so that only warnings and above are logged.
sparkContext.setLogLevel("WARN")

// Obtain the SQLContext.
// NOTE(review): constructing SQLContext directly is deprecated since Spark 2.0
// in favour of SparkSession.builder; kept so the declared type of `spark`
// stays unchanged.
val spark: SQLContext = new SQLContext(sparkContext)

// Read the source table over JDBC.
// NOTE(review): connection credentials are inline literals — consider loading
// them from configuration instead of source code.
val data: DataFrame = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://10.213.111.XXX:23306/buf_amr_all")
  .option("dbtable", "pdwqy_pms_yx_sbxx")
  .option("user", "qjjc")
  .option("password", "XXX")
  .load()

// Keep only the rows whose `sentence` column is not the empty string, then
// print the leading rows of the result to stdout.
// NOTE(review): under SQL null semantics a NULL `sentence` does not satisfy
// `<> ''`, so such rows are presumably filtered out as well — confirm this is
// intended. Also assumes the table has a `sentence` column — TODO confirm.
data.where("sentence <> ''").show()