Commit 174863a2 by serpucga

Added a new mode of execution, 'recovery', which allows continuing the execution of a task by loading a recovery file from a previous process
parent c9b7fb65
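
For orientation: the recovery file this commit introduces is a small JSON document (written with a .csv suffix) holding the connection parameters and the list of pages already dumped. A plausible example of its contents, reconstructed from the keys read and written in the diff below; all concrete values here are made up:

import json

# Hypothetical recovery file for a database 'mydb', paged in blocks of 1000
# tweets. The keys match those used by create_recovery_file and by the
# recovery branch of the main script; the values are illustrative only.
recovery_file_contents = {
    "host": "localhost",
    "port": 27017,
    "database": "mydb",
    "pagesize": 1000,
    "dumped_pages": [0, 1, 2, 5],  # pages already converted and written
}

with open(".recovery_mydb.csv", "w") as f:
    json.dump(recovery_file_contents, f)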
 pymongodump
 tests.py
 .mypy_cache
+.recovery*
@@ -22,7 +22,8 @@ def filesystem_writer(
         port: int,
         database: str,
         pagesize: int,
-        output_dir: str)\
+        output_dir: str,
+        recovery_file: str)\
         -> None:
     """
     Reads the CSV pages from the queue and writes them to filesystem
@@ -39,10 +40,12 @@
     logger.debug(
         "Worker {} launched: filesystem_writer executing"
         .format(os.getpid()))
-    recovery_file_path = os.path.join(
-        output_dir, ".recovery_" + database + ".csv")
-    create_recovery_file(
-        recovery_file_path, host, port, database, pagesize)
+    if recovery_file:
+        recovery_file_path = recovery_file
+    else:
+        recovery_file_path = ".recovery_" + database + ".csv"
+        create_recovery_file(
+            recovery_file_path, host, port, database, pagesize)
     while True:
         page_number, csv_page = queue.get()
@@ -294,9 +297,6 @@ def create_recovery_file(
     recovery_file_contents["pagesize"] = page_size
     recovery_file_contents["dumped_pages"] = []
-    parent_dir = os.path.split(file_path)[0]
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
     with open(file_path, "w") as f:
         json.dump(recovery_file_contents, f)
......
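
Note that these hunks only cover creating or reusing the recovery file; the diff does not show where filesystem_writer appends finished pages to "dumped_pages". A minimal sketch of that bookkeeping step, assuming the JSON file is rewritten after every page (mark_page_dumped is a hypothetical helper, not part of this commit):

import json

def mark_page_dumped(recovery_file_path: str, page_number: int) -> None:
    # Hypothetical helper: record that a CSV page has been written, so that
    # a later run started with --recovery can skip it.
    with open(recovery_file_path) as f:
        contents = json.load(f)
    contents["dumped_pages"].append(page_number)
    with open(recovery_file_path, "w") as f:
        json.dump(contents, f)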
@@ -5,6 +5,7 @@ import os
 import argparse
 import logging
 import time
+import json
 import multiprocessing as mp
 from config import globals
 from lib import utils
@@ -17,6 +18,7 @@ parser.add_argument("-p", "--port", type=int, default=27017)
 parser.add_argument("-s", "--pagesize", type=int, default=1000)
 parser.add_argument("-v", "--verbose", action="store_true")
 parser.add_argument("-t", "--timing", action="store_true")
+parser.add_argument("-r", "--recovery", type=str)
 parser.add_argument("database", type=str)
 args = parser.parse_args()
@@ -44,13 +46,29 @@ if args.timing:
     time0 = time.time()
 # MongoDB connection to get page index
-client = pymongo.MongoClient(args.host, args.port)
-database_tweets = client[args.database]["tweets"]
-page_index = utils.get_page_index(database_tweets, args.pagesize)
-client.close()
-logger.debug(
-    "Database {} partitioned in {} pages of {} tweets (maximum)"
-    .format(args.database, len(page_index), args.pagesize))
+if args.recovery:
+    with open(args.recovery) as f:
+        recovery_data = json.load(f)
+    client = pymongo.MongoClient(
+        recovery_data["host"], recovery_data["port"])
+    database_tweets = client[recovery_data["database"]]["tweets"]
+    full_page_index = utils.get_page_index(
+        database_tweets, recovery_data["pagesize"])
+    client.close()
+    page_index = [page for page in full_page_index
+                  if page not in recovery_data["dumped_pages"]]
+    logger.debug(
+        "Resuming collection conversion. {} of {} pages left."
+        .format(len(page_index), len(full_page_index)))
+else:
+    client = pymongo.MongoClient(args.host, args.port)
+    database_tweets = client[args.database]["tweets"]
+    page_index = utils.get_page_index(database_tweets, args.pagesize)
+    client.close()
+    logger.debug(
+        "Database {} partitioned in {} pages of {} tweets (maximum)"
+        .format(args.database, len(page_index), args.pagesize))
 # Build a picklable function that we can pass to map
@@ -63,21 +81,17 @@ def process_data_page(
         host, port, database, header, output_dir, pagesize, page, queue)
 try:
-    # Launch single process to write to the filesystem
-    writer_worker = mp.Process(
-        target=utils.filesystem_writer,
-        args=(task_queue, header, args.host, args.port,
-              args.database, args.pagesize, output_dir))
-    writer_worker.start()
-    # Launch pool of workers to perform the format conversion
-    with mp.Pool() as pool:
-        pool.map(process_data_page, page_index)
-    task_queue.put((-1, "END"))
+    # Launch single process to write to the filesystem
+    writer_worker = mp.Process(
+        target=utils.filesystem_writer,
+        args=(task_queue, header, args.host, args.port,
+              args.database, args.pagesize, output_dir, args.recovery))
+    writer_worker.start()
 except (KeyboardInterrupt, Exception):
     logger.error("A fatal error occurred. Script will terminate")
+# Launch pool of workers to perform the format conversion
+with mp.Pool() as pool:
+    pool.map(process_data_page, page_index)
+task_queue.put((-1, "END"))
 if globals.timing:
......
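
Taken together: a normal run creates .recovery_<database>.csv in the current working directory and partitions the collection from the command-line arguments, while a resumed run reads host, port, database and pagesize from the recovery file and skips every page listed in "dumped_pages". A toy illustration of that filter (values made up):

full_page_index = [0, 1, 2, 3, 4, 5]
recovery_data = {"dumped_pages": [0, 1, 2, 5]}
page_index = [page for page in full_page_index
              if page not in recovery_data["dumped_pages"]]
print(page_index)  # [3, 4] -> only the pages not yet dumped are reprocessed

A resumed run would then be launched with the new flag, e.g. python main.py -r .recovery_mydb.csv mydb, where main.py stands in for the script's actual name, which is not shown in this diff.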