// Load the question-tags CSV into a DataFrame and name its two columns.
// header=true uses the first CSV row as column names; inferSchema=true lets
// Spark sample the file to pick column types instead of defaulting to String.
val dfTags = spark
  .read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/data/question_tags_10K.csv")
  .toDF("id", "tag")

// Preview the first 3 rows
dfTags.show(3)
// Create a DataFrame by reading the questions CSV file.
// (The original comment said "parquet", but this clearly reads a CSV — fixed.)
// dateFormat tells the schema-inference pass how to parse the timestamp columns.
val dfQuestionsCSV = spark
.read
.option("header", "true")
.option("inferSchema", "true")
.option("dateFormat","yyyy-MM-dd HH:mm:ss")
.csv("/data/questions_10K.csv")
.toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count")
// Show 3 rows
dfQuestionsCSV.show(3)
// Inner-join the questions with their tags on the shared "id" column.
// The Seq-based using-column form deduplicates the join key in the output,
// exactly like the single-string overload the original used.
val dfQuestions = dfQuestionsCSV.join(dfTags, Seq("id"))

// Preview the first 3 joined rows
dfQuestions.show(3)

// Dump the joined schema as a tree for inspection
dfQuestions.printSchema()

// Register a session-scoped temp view so the data is queryable via Spark SQL
dfQuestions.createOrReplaceTempView("so_questions")
// Grab a single sample row from the view
val firstRow = spark.sql("SELECT * FROM so_questions LIMIT 1")
firstRow.show()

// Project only the id, score and owner_userid columns
val idScoreOwner = spark.sql("SELECT id, score, owner_userid FROM so_questions LIMIT 2")
idScoreOwner.show()

// How many distinct tags appear across all questions?
val distinctTagCount = spark.sql("SELECT COUNT(DISTINCT tag) FROM so_questions")
distinctTagCount.show()

// Questions scoring above 400
val highScore = spark.sql("SELECT id, score, owner_userid FROM so_questions WHERE score > 400 LIMIT 10")
highScore.show()

// Same filter, but highest scores first
val highScoreSorted = spark.sql("SELECT id, score, owner_userid FROM so_questions WHERE score > 400 ORDER BY score DESC LIMIT 10")
highScoreSorted.show()
// Select the top 10 most-answered questions.
// NOTE(review): the original queried a `view_count` column, but the view's schema
// (see the toDF calls above: id, creation_date, closed_date, deletion_date, score,
// owner_userid, answer_count, tag) has no such column, so both queries would fail
// with AnalysisException at runtime. answer_count is the closest popularity metric
// available in this dataset — confirm against the intended source data.
spark.sql("SELECT id, answer_count FROM so_questions ORDER BY answer_count DESC LIMIT 10").show()
// Top 10 most-answered questions together with their tag
// (the column is named `tag`, not `tags` — fixed to match the schema)
spark.sql("SELECT id, answer_count, tag FROM so_questions ORDER BY answer_count DESC LIMIT 10").show()
// (web-page footer residue from the scraped article — not Scala code; commented out so the file compiles)
// For further actions, you may consider blocking this person and/or reporting abuse
// Top comments (0)