1. Read the file with SparkContext, parse each line with Gson inside a map, convert each parsed line into a case class, fill in its fields, and write the result out.
Parsing the JSON itself posed no problems.
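In isolation, the Gson step boils down to something like the sketch below. The sample line and object name are made up for illustration; it uses the static JsonParser.parseString available in Gson 2.8.6+, while the full job further down uses the older new JsonParser().parse, which behaves the same way:

import com.google.gson.JsonParser

object GsonSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical one-line input, only to illustrate the nested-field access.
    val line = """{"campaign":{"id":42},"creative":{"creativeId":7}}"""
    val json = JsonParser.parseString(line).getAsJsonObject
    val campaignId = json.getAsJsonObject("campaign").getAsJsonPrimitive("id").getAsLong
    println(campaignId) // prints 42
  }
}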
I did run into a few issues when trying to overwrite the output of the RDD:
1. saveAsTextFile called directly has no overwrite=true parameter;
2. When converting to a DataFrame, every field has to be pulled out explicitly before toDF works;
3. For the write I first tried the text format, which complained that fields cannot be long or int; after casting them to string, it then complained that text allows only a single string column. Since Parquet also saves storage space, I ended up writing Parquet (see the sketch after this list for the two write paths).
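For reference, the difference between the two write paths in point 3 comes down to roughly the sketch below. Here df stands for the DataFrame built in the full code that follows, and the helper names are mine, not part of the original job:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, concat_ws}

object WriteSketch {
  // Sketch only: the text source accepts a single string column, so the fields
  // must first be cast and collapsed into one; Parquet takes the schema as-is.
  def writeAsText(df: DataFrame, path: String): Unit =
    df.select(concat_ws("\t", df.columns.map(c => col(c).cast("string")): _*).as("value"))
      .write.mode("overwrite").text(path)

  def writeAsParquet(df: DataFrame, path: String): Unit =
    df.write.mode("overwrite").parquet(path)
}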
The working code is below:
package XXX
import com.google.gson.JsonParser
import org.apache.spark.sql.SparkSession
import schema.caseClass1
object ParsecaseClass1Json {

  def main(args: Array[String]): Unit = {
    val inputPath = args(0)
    val outputPath = args(1)
    val sparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).getOrCreate()
    import sparkSession.implicits._

    // Parse every input line into a caseClass1 record
    val lines = sparkSession.sparkContext.textFile(inputPath)
    val result = lines.map(str => parseJsonStr(str))

    // toDF needs the fields listed explicitly (problem 2 above)
    val df = result.map(x => (
        x.adId,
        x.campaignId,
        x.settlementType,
        x.billingType,
        x.billingTypeCode,
        x.packageName
      ))
      .toDF()

    // Write as Parquet, overwriting any existing output (problems 1 and 3 above)
    df.coalesce(1).write.format("parquet").mode("overwrite").save(outputPath)
  }

  def parseJsonStr(str: String): caseClass1 = {
    val inputJson = new JsonParser().parse(str).getAsJsonObject
    val record = new caseClass1
    // 1. campaignId
    if (inputJson.has("campaign")) {
      val campaign = inputJson.getAsJsonObject("campaign")
      var campaignId: Long = 0
      if (campaign.has("id"))
        campaignId = campaign.getAsJsonPrimitive("id").getAsLong
      else if (campaign.has("campaignId"))
        campaignId = campaign.getAsJsonPrimitive("campaignId").getAsLong
      else
        System.err.println(s"No campaignId, inputJson: $str")
      record.campaignId = campaignId
    }
    // 2. creativeId
    if (inputJson.has("creative")) {
      val creative = inputJson.getAsJsonObject("creative")
      var adId: Long = 0
      if (creative.has("id"))
        adId = creative.getAsJsonPrimitive("id").getAsLong
      else if (creative.has("creativeId"))
        adId = creative.getAsJsonPrimitive("creativeId").getAsLong
      else
        System.err.println(s"No adId, inputJson: $str")
      record.adId = adId
    }
    /*
     3. settlementType
     4. billingType
     5. billingTypeCode
     6. appId -> packageName
     */
    if (inputJson.has("group")) {
      val group = inputJson.getAsJsonObject("group")
      record.settlementType = group.getAsJsonPrimitive("settlementType").getAsString
      record.billingType = group.getAsJsonPrimitive("billingType").getAsString
      record.billingTypeCode = group.getAsJsonPrimitive("billingTypeCode").getAsInt
      record.packageName = group.getAsJsonPrimitive("appId").getAsString
    }
    // Debug output: log every parsed record to stderr
    System.err.println(record.toString)
    record
  }
}
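A quick way to check the result is to read the Parquet output back with the same session and inspect the schema and a few rows (a verification snippet, not part of the original job; outputPath is the same path the job wrote to above):

val check = sparkSession.read.parquet(outputPath)
check.printSchema()
check.show(10, truncate = false)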
The caseClass1 class is as follows:
package XXX;

import java.io.Serializable;

public class caseClass1 implements Serializable {

    private static final long serialVersionUID = ***;

    public long adId = 0;
    public long campaignId = 0;
    public String settlementType;
    public String billingType;
    public Integer billingTypeCode;
    public String packageName;

    @Override
    public String toString() {
        return "caseClass1{" +
                "adId=" + adId +
                ", campaignId=" + campaignId +
                ", settlementType='" + settlementType + '\'' +
                ", billingType='" + billingType + '\'' +
                ", billingTypeCode=" + billingTypeCode +
                ", packageName='" + packageName + '\'' +
                '}';
    }

    public caseClass1() {
    }
}
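Incidentally, problem 2 above disappears if the record is a real Scala case class instead of this Java bean: with import sparkSession.implicits._ in scope, an RDD of a case class converts with toDF() directly and the schema is inferred from the fields. A sketch of that variant (the names are mine; Option[Int] stands in for the nullable Integer):

// Scala case class variant (sketch, not the original Java bean): toDF() derives
// the schema from the constructor fields, so the per-field tuple mapping in the
// main job is no longer needed.
case class AdRecord(
  adId: Long = 0L,
  campaignId: Long = 0L,
  settlementType: String = null,
  billingType: String = null,
  billingTypeCode: Option[Int] = None,
  packageName: String = null
)

// In main, assuming a hypothetical parseToAdRecord that mirrors parseJsonStr:
//   val df = lines.map(parseToAdRecord).toDF()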
That wraps up this article on parsing JSON files with Spark and writing the result to HDFS.