Commit 36d7a65a by serpucga

Merge branch 'feature/parallelism' into develop

parents e093713a 52dbc5e9
pymongodump
tests.py
.mypy_cache
timing = False
... @@ -3,69 +3,83 @@
 import pymongo
 import os
 import argparse
+import logging
+import time
+import multiprocessing as mp
+
+from config import globals
 from lib import utils
 
+# Command line parsing
+parser = argparse.ArgumentParser(
+    description="Dump the tweets of a database to a JSON file")
+parser.add_argument("-H", "--host", type=str, default="localhost")
+parser.add_argument("-p", "--port", type=int, default=27017)
+parser.add_argument("-s", "--pagesize", type=int, default=1000)
+parser.add_argument("-v", "--verbose", action="store_true")
+parser.add_argument("-t", "--timing", action="store_true")
+parser.add_argument("database", type=str)
+args = parser.parse_args()
-def get_tweets_page(collection, page_size: int, num_page: int):
-    tweets_page = []
-    tweets = collection.find().skip(num_page * page_size).limit(page_size)
-    for tweet in tweets:
-        tweets_page.append(tweet)
-    return tweets_page
-
-
-if __name__ == "__main__":
-
-    # Command line parsing
-    parser = argparse.ArgumentParser(
-        description="Dump the tweets of a database to a JSON file")
-    parser.add_argument("-H", "--host", type=str, default="localhost")
-    parser.add_argument("-p", "--port", type=int, default=27017)
-    parser.add_argument("-s", "--pagesize", type=int, default=1000)
-    parser.add_argument("database", type=str)
-    args = parser.parse_args()
-
-    # Dirs and files
-    script_dir = os.path.dirname(__file__)
-    output_dir = os.path.join(script_dir, "pymongodump", args.database)
-    header_file = os.path.join(script_dir, "header.txt")
-
-    # MongoDB connection
-    client = pymongo.MongoClient(args.host, args.port)
-    database_tweets = client[args.database]["tweets"]
-
-    with open(header_file) as f:
-        header = f.readline()
-    buffer_tweets = {}
-    num_page = 0
-
-    tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page)
-    while len(tweets_page) != 0:
-        buffer_tweets = {}
-        for tweet in tweets_page:
-            # Get output path and contents for the new CSV file
-            csv_tweet_output_path =\
-                utils.create_tweet_output_path(header, tweet, output_dir)
-            csv_tweet_contents =\
-                "\n" + str(utils.convert_tweet_to_csv(header, tweet))
-            # Check if buffer exists for the file. If not, add to dictionary
-            if csv_tweet_output_path not in buffer_tweets.keys():
-                buffer_tweets[csv_tweet_output_path] = ["", 0]
-            # Update the buffer adding the tweet and increasing tweet count
-            buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
-            buffer_tweets[csv_tweet_output_path][1] += 1
-
-        # Perform the write operations in each of the files
-        for output_path in buffer_tweets.keys():
-            with open(output_path, "a") as tweet_writer:
-                tweet_writer.write(buffer_tweets[output_path][0])
-            utils.increase_metadata_count(
-                os.path.join(output_dir, ".metadata.json"),
-                output_path, increase=buffer_tweets[output_path][1])
-        num_page += 1
-        tweets_page =\
-            get_tweets_page(database_tweets, args.pagesize, num_page)
+# Logging config
+logformat = "[%(asctime)s] %(message)s"
+dateformat = "%H:%M:%S"
+if args.verbose:
+    logging.basicConfig(
+        level=logging.DEBUG, format=logformat, datefmt=dateformat)
+else:
+    logging.basicConfig(
+        level=logging.ERROR, format=logformat, datefmt=dateformat)
+logger = logging.getLogger(__name__)
+
+# Initialize some variables
+script_dir = os.path.dirname(__file__)
+output_dir = os.path.join(script_dir, "pymongodump", args.database)
+header_file = os.path.join(script_dir, "config", "header.txt")
+with open(header_file) as f:
+    header = f.readline()
+buffer_tweets = {}
+task_queue = mp.Queue()
+if args.timing:
+    globals.timing = True
+    time0 = time.time()
+
+# MongoDB connection to get page index
+client = pymongo.MongoClient(args.host, args.port)
+database_tweets = client[args.database]["tweets"]
+page_index = utils.get_page_index(database_tweets, args.pagesize)
+client.close()
+logger.debug(
+    "Database {} partitioned in {} pages of {} tweets (maximum)"
+    .format(args.database, len(page_index), args.pagesize))
+
+
+# Build a picklable function that we can pass to map
+def process_data_page(
+        page, host=args.host, port=args.port, database=args.database,
+        pagesize=args.pagesize, header=header, outputdir=output_dir,
+        queue=task_queue):
+    utils.process_page(
+        host, port, database, header, output_dir, pagesize, page, queue)
+
+
+# Launch single process to write to the filesystem
+writer_worker = mp.Process(
+    target=utils.filesystem_writer, args=(task_queue, header, ))
+writer_worker.start()
+
+# Launch pool of workers to perform the format conversion
+with mp.Pool() as pool:
+    pool.map(process_data_page, page_index)
+task_queue.put("END")
+
+if globals.timing:
+    time1 = time.time()
+
+utils.generate_metadata_file(output_dir)
+
+if globals.timing:
+    logger.critical(
+        "Time spent generating metadata file: {}s"
+        .format(time.time() - time1))
+    logger.critical(
+        "Total execution time: {}s"
+        .format(time.time() - time0))
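The functions doing the real work (utils.get_page_index, utils.process_page, utils.filesystem_writer) live in the utilities module and are not part of this hunk. As rough orientation only, and not the code from the repository: the pattern is a pool of converter workers feeding a single writer process through task_queue, shut down by the "END" sentinel. A minimal sketch, assuming each queued item maps an output path to a (csv_text, tweet_count) buffer like the one the removed serial loop built; the function name, payload format, and header-on-first-write behaviour are assumptions, while the (queue, header) signature and the "END" sentinel come from the launch code above.

import os
import multiprocessing as mp


def filesystem_writer_sketch(queue, header):
    # Hypothetical sketch, NOT the implementation in lib/utils: drain the task
    # queue until the "END" sentinel arrives, appending each buffered chunk of
    # CSV text to its output file so that only this process touches the disk.
    while True:
        task = queue.get()
        if task == "END":
            break
        for output_path, (csv_text, tweet_count) in task.items():
            new_file = not os.path.isfile(output_path)
            with open(output_path, "a") as f:
                if new_file:
                    f.write(header)  # assumed: header written once per file
                f.write(csv_text)
            # tweet_count would feed the .metadata.json counters (assumed)


if __name__ == "__main__":
    # Minimal demo mirroring the launch code above.
    queue = mp.Queue()
    writer = mp.Process(target=filesystem_writer_sketch, args=(queue, "id,text"))
    writer.start()
    queue.put({"/tmp/example.csv": ("\n123,hello", 1)})
    queue.put("END")
    writer.join()

The hunk that follows shows two of the metadata helpers, apparently from that utilities module.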
import os
import json


def create_task_database_structure(
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Generate the following directory tree: a top dir that will contain
    all the tweet collections if it didn't exist yet and within it the top
    directory for this task with a new and empty metadata file
    """

    # Create the root directory for the tweet collection
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    collection_path = os.path.join(output_dir, db_name)
    if not os.path.isdir(collection_path):
        print("Initializing collection " + db_name + "...")
        os.mkdir(collection_path)
        generate_metadata_file(collection_path)
    return collection_path


def generate_metadata_file(collection_path) -> None:
    print("Executing generate_metadata_file")
    metadata_path = os.path.join(collection_path, ".metadata.json")
    file_metadata = {}  # type: Dict
    metadata = {}
    metadata["files"] = file_metadata
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
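For orientation, a hypothetical usage of the two helpers above, assuming they are importable from lib.utils as the main script's `from lib import utils` suggests; "pymongodump" mirrors the script's output directory name, while "some_db" is an invented database name.

import json
import os

from lib.utils import create_task_database_structure

# Creates ./pymongodump/ (if missing) and ./pymongodump/some_db/ with an
# empty metadata skeleton written by generate_metadata_file().
collection_path = create_task_database_structure("pymongodump", "some_db")

with open(os.path.join(collection_path, ".metadata.json")) as f:
    print(json.load(f))  # -> {'files': {}}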