Commit dcc81cb0 by serpucga

Separated exporting to JSON from exporting to CSV. Implemented creation of a filesystem tree to store the CSVs.

parent 56c27157
id,text,created_at,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,quoted_status_id,is_quote_status,retweet_count,favorite_count,user.id,user.name,user.created_at,user.screen_name,user.location,user.profile_image_url,user.verified,user.followers_count,user.friends_count,user.listed_count,user.favourites_count,user.statuses_count,user.geo_enabled,user.lang,entities.hashtags.text,entities.urls.expanded_url,entities.user_mentions.screen_name,entities.media.media_url,place.id,place.name,place.full_name,place.country,place.country_code,place.place_type,place.url,place.bounding_box.type,place.bounding_box.coordinates,coordinates.type,coordinates.coordinates
#!/usr/bin/env python
import pymongo
import os
import argparse
import json
import re
from tweet_manager.lib import json2csv, format_csv
def create_task_database_structure(
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Build the on-disk layout used to store a tweet collection.

    Makes sure the top-level output directory exists, then creates the
    per-task subdirectory named after the database (if missing) and
    seeds it with an empty ``.metadata.json`` file.

    :param output_dir: root directory holding every tweet collection
    :param db_name: name of the database, used as the subdirectory name
    :return: path of the collection subdirectory
    """
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    task_dir = os.path.join(output_dir, db_name)
    if not os.path.isdir(task_dir):
        print("Initializing collection " + db_name + "...")
        os.mkdir(task_dir)
        # Fresh collections start with an empty metadata registry
        generate_metadata_file(os.path.join(task_dir, ".metadata.json"))
    return task_dir
def generate_metadata_file(metadata_path) -> None:
    """
    Write a brand-new metadata file containing an empty file registry.

    :param metadata_path: full path where the JSON file is created;
        any existing file at that path is overwritten
    """
    print("Executing generate_metadata_file")
    skeleton = {"files": {}}  # maps CSV paths to their per-file info
    with open(metadata_path, "w") as out:
        json.dump(skeleton, out)
def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
    """
    Register a freshly created CSV in the collection's metadata file.

    The new entry starts with a tweet count of 0, so this should only
    be used for files that were just added to the collection.  When the
    metadata file does not exist yet, it is created and the
    registration is retried.

    :param file_path: path of the CSV file to register
    :param metadata_path: path of the collection's .metadata.json
    """
    print("Executing add_newfile_to_metadata")
    try:
        with open(metadata_path, "r+") as meta:
            contents = json.load(meta)
            contents["files"][file_path] = {"count": 0}
            # Rewrite the file in place from the beginning
            meta.seek(0)
            meta.truncate()
            json.dump(contents, meta)
    except IOError:
        # Metadata file missing: create it, then retry once
        generate_metadata_file(metadata_path)
        add_newfile_to_metadata(file_path, metadata_path)
def create_tweet_output_path(
        tweet: dict,
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Compute (and create if needed) the CSV file a tweet belongs to.

    The tweet is classified chronologically: a year/month directory
    tree is built under the collection directory, and the tweet maps to
    a per-day CSV file.  A CSV file that did not exist yet is seeded
    with the header line and registered in the collection's metadata.

    :param tweet: tweet document as fetched from MongoDB
    :param output_dir: root directory for all tweet collections
    :param db_name: name of the database the tweet belongs to
    :return: path of the CSV file this tweet should be appended to
    :raises ValueError: if the tweet's creation date cannot be parsed
    """
    collection_path = create_task_database_structure(output_dir, db_name)
    # Extract year, month and day from the tweet using a regex
    match = re.search(
        r"^(\d{4})-(\d{2})-(\d{2}) \d{2}:\d{2}:\d{2}$",
        str(tweet["created_at"])
    )
    if match is None:
        # Fail loudly instead of raising a cryptic AttributeError below
        raise ValueError(
            "Unparsable created_at value: " + str(tweet["created_at"]))
    year, month, day = match.group(1), match.group(2), match.group(3)
    # NOTE(review): the day slot is left empty because the directory
    # tree only goes down to year/month; the day becomes the CSV file
    # name below — confirm against json2csv.mkdir_tweet_date
    date = (year, month, "")
    # Classify the tweet chronologically
    tweet_output_path = json2csv.mkdir_tweet_date(date, collection_path)
    tweet_output_file = os.path.join(tweet_output_path, day + ".csv")
    # If the CSV file didn't already exist, initialize it with a header
    if not os.path.isfile(tweet_output_file):
        # header_file is a module-level global set in the __main__
        # section of this script
        with open(header_file) as f:
            header = f.readline().strip()
        with open(tweet_output_file, "w") as fw:
            fw.write(header)
        add_newfile_to_metadata(
            tweet_output_file,
            os.path.join(collection_path, ".metadata.json"))
    return tweet_output_file
#
# def convert_tweet_to_csv(tweet: dict) -> str:
# # Flatten the tweet and store it in status_flat
# status_flat = json2csv.flatten_dictionary(tweet)
#
# # Convert the flat JSON to CSV format
# # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
# # number of array compression levels, 4th arg: remove dollars mode
# status_csv = json2csv.json2csv(status_flat, True, 5, False)
#
# # Get the default header with the fields to keep
# with open(config.CSV_HEADER) as f:
# header = f.readline()
#
# csv_appendable_line = format_csv.get_csv_line(header, status_csv)
#
# return csv_appendable_line
if __name__ == '__main__':
    # Command line parsing
    arg_parser = argparse.ArgumentParser(
        description="Dump the tweets of a database to a JSON file")
    arg_parser.add_argument("-H", "--host", type=str, default="localhost")
    arg_parser.add_argument("-p", "--port", type=int, default=27017)
    arg_parser.add_argument("database", type=str)
    args = arg_parser.parse_args()

    # Dirs and files.  NOTE: header_file is also read as a global by
    # create_tweet_output_path, so its name must not change.
    script_dir = os.path.dirname(__file__)
    output_dir = os.path.join(script_dir, "pymongodump")
    header_file = os.path.join(script_dir, "header.txt")

    # MongoDB connection
    client = pymongo.MongoClient(args.host, args.port)
    database_tweets = client[args.database]["tweets"]

    with open(header_file) as f:
        header = f.readline()

    # Build the dated directory tree / day files for every stored tweet
    for tweet in database_tweets.find():
        create_tweet_output_path(tweet, output_dir, args.database)
        # flat_tweet = json2csv.flatten_dictionary(tweet)
        # csv_rawline = json2csv.json2csv(flat_tweet, True, 5, False)
        # csv_appendable_line =\
        #     format_csv.get_csv_line(header, csv_rawline)
......@@ -3,6 +3,8 @@
import pymongo
import os
import argparse
import pprint
from tweet_manager.lib.json2csv import flatten
parser = argparse.ArgumentParser(
description="Dump the tweets of a database to a JSON file")
......
import os
import json
def create_task_database_structure(
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Build the on-disk layout used to store a tweet collection.

    Makes sure the top-level output directory exists, then creates the
    per-task subdirectory named after the database (if missing) and
    seeds it with an empty metadata file.

    :param output_dir: root directory holding every tweet collection
    :param db_name: name of the database, used as the subdirectory name
    :return: path of the collection subdirectory
    """
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    task_path = os.path.join(output_dir, db_name)
    # Nothing to do when the collection directory already exists
    if os.path.isdir(task_path):
        return task_path
    print("Initializing collection " + db_name + "...")
    os.mkdir(task_path)
    generate_metadata_file(task_path)
    return task_path
def generate_metadata_file(collection_path) -> None:
    """
    Create an empty ``.metadata.json`` registry inside a collection dir.

    :param collection_path: directory of the tweet collection; the
        metadata file is created directly inside it, overwriting any
        previous one
    """
    print("Executing generate_metadata_file")
    target = os.path.join(collection_path, ".metadata.json")
    # The registry maps CSV paths to their per-file info; starts empty
    with open(target, "w") as out:
        json.dump({"files": {}}, out)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment