Commit ed2c9d74 by serpucga

First parallel version of the code

Parallelized using the multiprocessing library. I'm not really sure the code is thread safe. I think we don't care if tweets are appended to the files in a different order, but corrupted metadata files would be problematic. In the first tests the metadata files came out fine, but I think this line is probably not thread safe (two workers could load the old value and try to update it at the same time, resulting in inconsistencies):

    metadata_file["files"][file_path]["count"] += increase

Apart from that, the code is much faster than before.
parent 34776b63
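One way to make that increment safe would be to serialize the read-modify-write of the metadata JSON with a shared lock handed to every worker process. The sketch below only illustrates that idea and is not part of this commit; the helper name increase_metadata_count_locked is hypothetical and the JSON layout is assumed from the line quoted above.

    import json
    import multiprocessing as mp

    # Created once in the parent and passed to every worker through the
    # Process args tuple, so that all workers share the same lock object.
    metadata_lock = mp.Lock()

    def increase_metadata_count_locked(lock, metadata_path, file_path, increase=1):
        # Hypothetical helper (not in this commit): the lock ensures that only
        # one process at a time runs the load -> update -> dump cycle, so no
        # increment is lost to a concurrent read of a stale count.
        with lock:
            with open(metadata_path, "r") as f:
                metadata = json.load(f)
            metadata["files"][file_path]["count"] += increase
            with open(metadata_path, "w") as f:
                json.dump(metadata, f)

write_tweets_to_files would then receive the lock as an extra argument and call this helper instead of utils.increase_metadata_count.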
@@ -33,7 +33,7 @@ def create_task_database_structure(
def generate_metadata_file(metadata_path) -> None:
    print("Executing generate_metadata_file")
    file_metadata = {}  # type: Dict
    file_metadata = {}
    metadata = {}
    metadata["files"] = file_metadata
@@ -92,7 +92,10 @@ def create_tweet_output_path(
        tweet: dict,
        output_dir: str)\
        -> str:
    try:
        collection_path = create_task_database_structure(output_dir)
    except FileExistsError as e:
        collection_path = e.filename
    # Extract year, month and date from the tweet using a regex
    matchObj = re.search(
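The try/except added above follows the EAFP pattern: attempt to create the task's directory structure and, if it already exists, recover the existing path from the exception instead of checking for the directory first. A minimal standalone sketch of the same pattern (names are illustrative, not from the repo):

    import os

    def ensure_dir(path: str) -> str:
        try:
            os.makedirs(path)
        except FileExistsError:
            pass  # already created, possibly by another process
        return path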
@@ -3,6 +3,7 @@
import pymongo
import os
import argparse
import multiprocessing as mp
from math import ceil
from lib import utils
@@ -20,6 +21,33 @@ def get_tweets_page(collection, page_size: int, num_page: int):
    return tweets


def write_tweets_to_files(header: str, output_dir: str, tweets_page):
    print("Hi there! write_tweets_to_files executing")
    buffer_tweets = {}
    for tweet in tweets_page:
        # Get output path and contents for the new CSV file
        csv_tweet_output_path =\
            utils.create_tweet_output_path(header, tweet, output_dir)
        csv_tweet_contents =\
            "\n" + str(utils.convert_tweet_to_csv(header, tweet))
        # Check if buffer exists for the file. If not, add to dictionary
        if csv_tweet_output_path not in buffer_tweets.keys():
            buffer_tweets[csv_tweet_output_path] = ["", 0]
        # Update the buffer adding the tweet and increasing tweet count
        buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
        buffer_tweets[csv_tweet_output_path][1] += 1
    # Perform the write operations in each of the files
    for output_path in buffer_tweets.keys():
        with open(output_path, "a") as tweet_writer:
            tweet_writer.write(buffer_tweets[output_path][0])
        utils.increase_metadata_count(
            os.path.join(output_dir, ".metadata.json"),
            output_path, increase=buffer_tweets[output_path][1])


if __name__ == "__main__":
    # Command line parsing
@@ -46,27 +74,13 @@ if __name__ == "__main__":
    num_page = 0
    page_index = get_page_index(database_tweets, args.pagesize)
    for page in page_index:
        buffer_tweets = {}
        for tweet in page:
            # Get output path and contents for the new CSV file
            csv_tweet_output_path =\
                utils.create_tweet_output_path(header, tweet, output_dir)
            csv_tweet_contents =\
                "\n" + str(utils.convert_tweet_to_csv(header, tweet))
            # Check if buffer exists for the file. If not, add to dictionary
            if csv_tweet_output_path not in buffer_tweets.keys():
                buffer_tweets[csv_tweet_output_path] = ["", 0]
            # Update the buffer adding the tweet and increasing tweet count
            buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
            buffer_tweets[csv_tweet_output_path][1] += 1
    output = mp.Queue()  # not yet used by write_tweets_to_files
    # Keep the processes in a list (not a generator) so they can be iterated
    # twice: once to start them all and once to join them
    processes = [mp.Process(
        target=write_tweets_to_files, args=(
            header, output_dir, page)) for page in page_index]
        # Perform the write operations in each of the files
        for output_path in buffer_tweets.keys():
            with open(output_path, "a") as tweet_writer:
                tweet_writer.write(buffer_tweets[output_path][0])
            utils.increase_metadata_count(
                os.path.join(output_dir, ".metadata.json"),
                output_path, increase=buffer_tweets[output_path][1])
    for p in processes:
        p.start()
    for p in processes:
        p.join()
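For reference only, not part of this commit: the same page-per-worker dispatch could also be written with a multiprocessing.Pool, which caps the number of concurrent processes instead of spawning one per page and handles the start/join bookkeeping itself. The pool size of 4 is an arbitrary choice; header, output_dir and page_index are the variables already defined in the script, and mp is the multiprocessing import added in this commit.

    with mp.Pool(processes=4) as pool:
        pool.starmap(
            write_tweets_to_files,
            [(header, output_dir, page) for page in page_index])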