Hackerss.com

hackerss
hackerss

Posted on • Updated on

Scala (Spark): Join two DataFrames and query them with SQL


// Load the question/tag pairs from CSV: the first line is treated as a header,
// column types are inferred, and the columns are then renamed to (id, tag).
val dfTags = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/data/question_tags_10K.csv")
  .toDF("id", "tag")

// Preview the first 3 rows
dfTags.show(3)

// Load the questions from CSV: the first line is treated as a header, column
// types are inferred, and the seven columns are renamed to the names below.
// NOTE(review): "dateFormat" only affects date parsing; confirm whether the
// timestamp columns need "timestampFormat" for this data set.
val dfQuestionsCSV = spark.read
  .options(Map(
    "header" -> "true",
    "inferSchema" -> "true",
    "dateFormat" -> "yyyy-MM-dd HH:mm:ss"
  ))
  .csv("/data/questions_10K.csv")
  .toDF(
    "id",
    "creation_date",
    "closed_date",
    "deletion_date",
    "score",
    "owner_userid",
    "answer_count"
  )

// Preview the first 3 rows
dfQuestionsCSV.show(3)

// Inner-join the questions with the tags on their shared "id" column.
// A question id that matches several tag rows produces one output row per match.
val dfQuestions = dfQuestionsCSV.join(dfTags, Seq("id"), "inner")

// Preview the first 3 joined rows
dfQuestions.show(3)

// Display the joined schema as a tree
dfQuestions.printSchema()

// Expose the joined DataFrame to Spark SQL under the name "so_questions"
dfQuestions.createOrReplaceTempView("so_questions")

// Run each exploratory query in order and print its result
Seq(
  // One row (no ORDER BY, so which row comes back is not specified)
  "SELECT * FROM so_questions LIMIT 1",
  // Only the id, score and owner_userid columns, two rows
  "SELECT id, score, owner_userid FROM so_questions LIMIT 2",
  // Number of distinct tags
  "SELECT COUNT(DISTINCT tag) FROM so_questions",
  // Up to 10 questions with score > 400
  "SELECT id, score, owner_userid FROM so_questions WHERE score > 400 LIMIT 10",
  // Up to 10 questions with score > 400, highest score first
  "SELECT id, score, owner_userid FROM so_questions WHERE score > 400 ORDER BY score DESC LIMIT 10"
).foreach(query => spark.sql(query).show())

// The so_questions view exposes: id, creation_date, closed_date, deletion_date,
// score, owner_userid, answer_count and tag. It has no view-count column and the
// tag column is named `tag` (singular), so these rankings use answer_count.
// answer_count is cast to INT so the ordering is numeric even if schema
// inference typed the column as a string; values that fail the cast become NULL,
// which sorts last under DESC.

// Select the top 10 rows by answer count
// (a question appears once per tag, so the same id may repeat)
spark.sql("SELECT id, answer_count FROM so_questions ORDER BY CAST(answer_count AS INT) DESC LIMIT 10").show()

// Select the top 10 rows by answer count together with their tag
spark.sql("SELECT id, answer_count, tag FROM so_questions ORDER BY CAST(answer_count AS INT) DESC LIMIT 10").show()

Enter fullscreen mode Exit fullscreen mode

Top comments (0)