Spark中DataFrame去除NaN、null以及空字符串数据

mac · 2025-06-01 · 5

去除含有 null 或 NaN 的行(只要某一行中任意一列为 null 或 NaN,整行都会被删除):

// Reads a MySQL table over JDBC, removes every row that contains a null or
// NaN value, and prints the remaining rows.

// Spark configuration. The application name is this class's simple name with
// every '$' character filtered out. No master URL is set here, so it has to be
// supplied when the job is launched.
val sparkConf: SparkConf = new SparkConf()
  .setAppName(this.getClass.getSimpleName.filter(!_.equals('$')))

// Create the SparkContext from the configuration above.
val sparkContext = new SparkContext(sparkConf)

// Only log messages at WARN level or above.
sparkContext.setLogLevel("WARN")

// SQLContext built on top of the SparkContext; used here only for reading.
val spark: SQLContext = new SQLContext(sparkContext)

// Load the source table through the JDBC data source.
// NOTE(review): host and password are redacted placeholders ("XXX") — replace
// them with real values before running.
val data: DataFrame = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://10.213.111.XXX:23306/buf_amr_all")
  .option("dbtable", "pdwqy_pms_yx_sbxx")
  .option("user", "qjjc")
  .option("password", "XXX")
  .load()

try {
  // na.drop() without arguments drops a row as soon as any of its columns is
  // null or NaN; show() then prints the surviving rows.
  data.na.drop().show()
} finally {
  // Release the Spark resources even when the action above fails.
  sparkContext.stop()
}

去掉空字符串(按 sentence 列过滤;该列为 null 的行也会一并被过滤掉):

// Reads a MySQL table over JDBC, removes the rows whose `sentence` column is
// an empty string, and prints the remaining rows.

// Spark configuration. The application name is this class's simple name with
// every '$' character filtered out. No master URL is set here, so it has to be
// supplied when the job is launched.
val sparkConf: SparkConf = new SparkConf()
  .setAppName(this.getClass.getSimpleName.filter(!_.equals('$')))

// Create the SparkContext from the configuration above.
val sparkContext = new SparkContext(sparkConf)

// Only log messages at WARN level or above.
sparkContext.setLogLevel("WARN")

// SQLContext built on top of the SparkContext; used here only for reading.
val spark: SQLContext = new SQLContext(sparkContext)

// Load the source table through the JDBC data source.
// NOTE(review): host and password are redacted placeholders ("XXX") — replace
// them with real values before running.
val data: DataFrame = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://10.213.111.XXX:23306/buf_amr_all")
  .option("dbtable", "pdwqy_pms_yx_sbxx")
  .option("user", "qjjc")
  .option("password", "XXX")
  .load()

try {
  // Keep only rows whose `sentence` value differs from the empty string.
  // A SQL comparison against NULL is never true, so rows where `sentence` is
  // null are filtered out as well. Whitespace-only values are kept.
  // NOTE(review): assumes the table has a string column named `sentence` —
  // confirm against the actual schema.
  data.where("sentence <> ''").show()
} finally {
  // Release the Spark resources even when the action above fails.
  sparkContext.stop()
}
最新回复(0)