Commit 95eb843f by serpucga

Script working (seemingly fine)

parent dcc81cb0
@@ -5,12 +5,17 @@ import os
 import argparse
 import json
 import re
+import datetime
+from email.utils import parsedate
 from tweet_manager.lib import json2csv, format_csv
+def parse_datetime(string):
+    return datetime.datetime(*(parsedate(string)[:6]))
 def create_task_database_structure(
-        output_dir: str,
-        db_name: str)\
+        output_dir: str)\
         -> str:
     """
     Generate the following directory tree: a top dir that will contain
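Note: `email.utils.parsedate` returns a 9-field date tuple (or `None` if the string cannot be parsed), and slicing its first six fields gives `datetime.datetime` the year, month, day, hour, minute and second. A minimal usage sketch, assuming RFC-2822-style date strings (the sample date is illustrative):

```python
import datetime
from email.utils import parsedate

# parsedate -> (year, month, day, hour, minute, second, weekday, yearday, dst)
fields = parsedate("Tue, 19 Mar 2019 14:05:10 +0000")
print(datetime.datetime(*fields[:6]))
# 2019-03-19 14:05:10
```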
@@ -19,6 +24,7 @@ def create_task_database_structure(
     """
     # Create the root directory for the tweet collection
+    (output_dir, db_name) = os.path.split(output_dir)
     if not os.path.isdir(output_dir):
         print(
             "Building directory to contain the collected tweets at: "
@@ -67,12 +73,35 @@ def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
         add_newfile_to_metadata(file_path, metadata_path)
+def increase_metadata_count(
+        metadata_path: str,
+        file_path: str,
+        increase: int = 1)\
+        -> None:
+    """
+    Use this when one tweet is appended to one of the CSVs in the
+    collection. This function will update the metadata file by increasing
+    by x the corresponding dictionary structure
+    """
+    print("Executing increase_metadata_count")
+    try:
+        with open(metadata_path, "r+") as f:
+            metadata_file = json.load(f)
+            metadata_file["files"][file_path]["count"] += increase
+            f.seek(0)
+            f.truncate()
+            json.dump(metadata_file, f)
+    except IOError:
+        generate_metadata_file(metadata_path)
+        increase_metadata_count(metadata_path, file_path, increase)
 def create_tweet_output_path(
         tweet: dict,
-        output_dir: str,
-        db_name: str)\
+        output_dir: str)\
         -> str:
-    collection_path = create_task_database_structure(output_dir, db_name)
+    collection_path = create_task_database_structure(output_dir)
     # Extract year, month and date from the tweet using a regex
     matchObj = re.search(
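Note: judging from the lookup `metadata_file["files"][file_path]["count"]`, the `.metadata.json` file presumably holds per-CSV tweet counts shaped roughly like the sketch below (the exact layout beyond `files` and `count` is an assumption). If the file is missing, the `except IOError` branch regenerates it via `generate_metadata_file` (defined elsewhere in this script) and retries; a file that exists but lacks the `file_path` entry would still raise a `KeyError`, which this handler does not catch.

```python
# Assumed shape of .metadata.json (path and count are illustrative):
metadata = {
    "files": {
        "pymongodump/my_database/2019/03/19.csv": {"count": 42},
    },
}
```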
@@ -99,23 +128,20 @@ def create_tweet_output_path(
     return tweet_output_file
-#
-# def convert_tweet_to_csv(tweet: dict) -> str:
-#     # Flatten the tweet and store it in status_flat
-#     status_flat = json2csv.flatten_dictionary(tweet)
-#
-#     # Convert the flat JSON to CSV format
-#     # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
-#     # number of array compression levels, 4th arg: remove dollars mode
-#     status_csv = json2csv.json2csv(status_flat, True, 5, False)
-#
-#     # Get the default header with the fields to keep
-#     with open(config.CSV_HEADER) as f:
-#         header = f.readline()
-#
-#     csv_appendable_line = format_csv.get_csv_line(header, status_csv)
-#
-#     return csv_appendable_line
+def convert_tweet_to_csv(header: str, tweet: dict) -> str:
+    # Flatten the tweet and store it in status_flat
+    status_flat = json2csv.flatten_dictionary(tweet)
+    # Convert the flat JSON to CSV format
+    # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
+    # number of array compression levels, 4th arg: remove dollars mode
+    status_csv = json2csv.json2csv(status_flat, True, 5, False)
+    csv_appendable_line = format_csv.get_csv_line(header, status_csv)
+    return csv_appendable_line
 if __name__ == '__main__':
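Note: `json2csv.flatten_dictionary` and `format_csv.get_csv_line` come from the project's own `tweet_manager` library, so their exact behavior is not shown in this diff. As a purely illustrative sketch (not the library's implementation), dictionary flattening of this kind typically collapses nested keys into dotted paths:

```python
def flatten(d, parent_key="", sep="."):
    # Collapse nested dicts one level at a time:
    # {"user": {"id": 1}} -> {"user.id": 1}
    items = {}
    for key, value in d.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            items.update(flatten(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items

print(flatten({"user": {"id": 1, "name": "x"}, "text": "hi"}))
# {'user.id': 1, 'user.name': 'x', 'text': 'hi'}
```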
@@ -129,7 +155,7 @@ if __name__ == '__main__':
     # Dirs and files
     script_dir = os.path.dirname(__file__)
-    output_dir = os.path.join(script_dir, "pymongodump")
+    output_dir = os.path.join(script_dir, "pymongodump", args.database)
     header_file = os.path.join(script_dir, "header.txt")
     # MongoDB connection
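Note: the MongoDB connection block is truncated by the diff. A minimal sketch of what it plausibly contains, assuming a local MongoDB instance; the host, port and collection name `"tweets"` are assumptions, while `args.database` and `database_tweets` appear in the diff itself:

```python
import pymongo

client = pymongo.MongoClient("localhost", 27017)  # assumed host/port
database_tweets = client[args.database]["tweets"]  # collection name assumed
```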
@@ -138,11 +164,28 @@ if __name__ == '__main__':
     with open(header_file) as f:
         header = f.readline()
-    for tweet in database_tweets.find():
-        create_tweet_output_path(tweet, output_dir, args.database)
-        # flat_tweet = json2csv.flatten_dictionary(tweet)
-        # csv_rawline = json2csv.json2csv(flat_tweet, True, 5, False)
-        # csv_appendable_line =\
-        #     format_csv.get_csv_line(header, csv_rawline)
+    buffer_tweets = {}
+    for tweet in database_tweets.find():
+        # Get output path and contents for the new CSV file
+        csv_tweet_output_path =\
+            create_tweet_output_path(tweet, output_dir)
+        csv_tweet_contents =\
+            "\n" + str(convert_tweet_to_csv(header, tweet))
+        # Check if buffer exists for the file. If not, add to dictionary
+        if csv_tweet_output_path not in buffer_tweets.keys():
+            buffer_tweets[csv_tweet_output_path] = ["", 0]
+        # Update the buffer adding the tweet and increasing tweet count
+        buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
+        buffer_tweets[csv_tweet_output_path][1] += 1
+    # Perform the write operations in each of the files
+    for output_path in buffer_tweets.keys():
+        with open(output_path, "a") as tweet_writer:
+            tweet_writer.write(buffer_tweets[output_path][0])
+        increase_metadata_count(
+            os.path.join(output_dir, ".metadata.json"),
+            output_path, increase=buffer_tweets[output_path][1]
+        )
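Note: buffering per output path means each CSV file (and the metadata counter) is written once per run instead of once per tweet. The same accumulation could be expressed a bit more directly with `collections.defaultdict`; an equivalent sketch of the loop above, not the committed code:

```python
from collections import defaultdict

buffer_tweets = defaultdict(lambda: ["", 0])
for tweet in database_tweets.find():
    path = create_tweet_output_path(tweet, output_dir)
    buffer_tweets[path][0] += "\n" + convert_tweet_to_csv(header, tweet)
    buffer_tweets[path][1] += 1
```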