From f13555fc0cce1f57efcf21b3a69cf409707388a8 Mon Sep 17 00:00:00 2001 From: myblackbeard Date: Tue, 20 Apr 2021 13:43:41 +0200 Subject: [PATCH] fix bug: update JSON to Parquet serialization JSON format was still in use and prevented Athena from working. Parquet needs to be used, since that's the format used in Firehose when record format conversion was enabled. --- cloudformation/glue.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloudformation/glue.yml b/cloudformation/glue.yml index ad0bac0..b6805c6 100644 --- a/cloudformation/glue.yml +++ b/cloudformation/glue.yml @@ -40,12 +40,12 @@ Resources: - - "s3://" - !Ref pBucketName - "/raw_reddit_comments/" - InputFormat: org.apache.hadoop.mapred.TextInputFormat - OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat BucketColumns: [] SortColumns: [] SerdeInfo: - SerializationLibrary: org.openx.data.jsonserde.JsonSerDe + SerializationLibrary: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe Parameters: serialization.format: '1' StoredAsSubDirectories: false