serpucga / migration_scripts

Commit c9b7fb65, authored Jul 19, 2019 by serpucga
Parent: 053f7b0f

    Generation of recovery file achieved

Showing 3 changed files with 100 additions and 34 deletions:

    config/globals.py       +0   -1
    lib/utils.py            +95  -26
    pymongoexport_csv.py    +5   -7
config/globals.py

 timing = False
-dumped_pages = []
lib/utils.py

@@ -15,7 +15,15 @@ import logging
 
 logger = logging.getLogger(__name__)
 
 
-def filesystem_writer(queue: mp.Queue, header: str) -> None:
+def filesystem_writer(
+        queue: mp.Queue,
+        header: str,
+        host: str,
+        port: int,
+        database: str,
+        pagesize: int,
+        output_dir: str) \
+        -> None:
     """
     Reads the CSV pages from the queue and writes them to filesystem

@@ -31,30 +39,40 @@ def filesystem_writer(queue: mp.Queue, header: str) -> None:
     logger.debug("Worker {} launched: filesystem_writer executing"
                  .format(os.getpid()))
+    recovery_file_path = os.path.join(
+        output_dir, ".recovery_" + database + ".csv")
+    create_recovery_file(recovery_file_path, host, port, database, pagesize)
     while True:
-        csv_page = queue.get()
+        page_number, csv_page = queue.get()
         if csv_page == "END":
             logger.info("Writing loop finished")
+            os.remove(recovery_file_path)
             break
-        if globals.timing and csv_page is not None:
-            time0 = time.time()
-        for output_path in csv_page.keys():
-            logger.debug("Dumping tweets for " + output_path)
-            if os.path.isfile(output_path):
-                with open(output_path, "a") as writer:
-                    writer.write(csv_page[output_path])
-            else:
-                logger.debug("File {} not found, generating new..."
-                             .format(output_path))
-                generate_path(output_path, header)
-                with open(output_path, "a") as writer:
-                    writer.write(csv_page[output_path])
-        if globals.timing and csv_page is not None:
-            time1 = time.time()
-            logger.critical("Time spent writing tweet page to FS: {}s"
-                            .format(time1 - time0))
+        elif csv_page is not None:
+            if globals.timing:
+                time0 = time.time()
+            for output_path in csv_page.keys():
+                logger.debug("Dumping tweets for " + output_path)
+                if os.path.isfile(output_path):
+                    with open(output_path, "a") as writer:
+                        writer.write(csv_page[output_path])
+                else:
+                    logger.debug("File {} not found, generating new..."
+                                 .format(output_path))
+                    generate_path(output_path, header)
+                    with open(output_path, "a") as writer:
+                        writer.write(csv_page[output_path])
+            if page_number >= 0:
+                update_recovery_file(recovery_file_path, page_number)
+            if globals.timing:
+                logger.critical("Time spent writing tweet page to FS: {}s"
+                                .format(time.time() - time0))
+        continue

@@ -101,7 +119,7 @@ def process_page(
             buffer_tweets[csv_tweet_output_path] += csv_tweet_contents
 
     client.close()
-    queue.put(buffer_tweets)
+    queue.put((page_number, buffer_tweets))
     logger.debug("Page {} enqueued".format(page_number))
 
     if globals.timing:

@@ -218,9 +236,9 @@ def dump_recovery_file(
         port: int,
         database: str,
         page_size: int,
-        dumped_pages: List[int],
-        error_page: int,
-        output_dir: str) \
+        dumped_pages: list,
+        output_dir: str,
+        error_page: int = None) \
         -> None:
     """
     In case of error, dump information to file to allow recovery

@@ -240,16 +258,67 @@ def dump_recovery_file(
     recovery_file_contents["port"] = port
     recovery_file_contents["database"] = database
     recovery_file_contents["pagesize"] = page_size
-    recovery_file_contents["dumped_pages"] = globals.dumped_pages
-    recovery_file_contents["error_page"] = error_page
+    recovery_file_contents["dumped_pages"] = dumped_pages
+    recovery_file_contents["error_page"] = str(error_page)
+    logger.debug("HERE DUMPED_PAGES: {}".format(dumped_pages))
 
     with open(recovery_file_path, "w") as f:
-        json.dump(f)
+        json.dump(recovery_file_contents, f)
     logger.error("Generated recovery file at {}".format(recovery_file_path))
 
 
+def create_recovery_file(
+        file_path: str,
+        host: str,
+        port: int,
+        database: str,
+        page_size: int) \
+        -> None:
+    """
+    In case of error, dump information to file to allow recovery
+
+    :param host: address of the host to which the script connected
+    :param port: port of the Mongo database
+    :param database: name of the database being queried
+    :param page_size: size of the page that was being used
+    """
+    recovery_file_contents = {}
+    recovery_file_contents["host"] = host
+    recovery_file_contents["port"] = port
+    recovery_file_contents["database"] = database
+    recovery_file_contents["pagesize"] = page_size
+    recovery_file_contents["dumped_pages"] = []
+
+    parent_dir = os.path.split(file_path)[0]
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+    with open(file_path, "w") as f:
+        json.dump(recovery_file_contents, f)
+    logger.error("Generated recovery file at {}".format(file_path))
+
+
+def update_recovery_file(
+        file_path: str,
+        page_number: int) \
+        -> None:
+    """
+    Add a new page to the list of already dumped pages in the recovery file
+    """
+    with open(file_path, "r") as f:
+        recovery_file_contents = json.load(f)
+    recovery_file_contents["dumped_pages"].append(page_number)
+    with open(file_path, "w") as f:
+        json.dump(recovery_file_contents, f)
+
+
 #########################
 #  TWEET DB PAGINATION  #
 #########################
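For context, the recovery file that filesystem_writer now maintains is a small JSON document (despite the ".csv" suffix in the path it is given). A minimal sketch of its lifecycle, condensing create_recovery_file and update_recovery_file from the hunks above; the host, port, database, page size and the /tmp/output directory are placeholder values, not taken from the project configuration:

import json
import os

# Placeholder connection parameters (illustrative only).
host, port, database, pagesize = "localhost", 27017, "some_db", 1000
recovery_file_path = os.path.join("/tmp/output", ".recovery_" + database + ".csv")

# create_recovery_file: seed the JSON document with the run parameters
# and an empty list of already-dumped pages.
os.makedirs(os.path.dirname(recovery_file_path), exist_ok=True)
with open(recovery_file_path, "w") as f:
    json.dump({"host": host, "port": port, "database": database,
               "pagesize": pagesize, "dumped_pages": []}, f)

# update_recovery_file: after each page is written to disk, append its
# number to "dumped_pages" so an interrupted run knows what was completed.
for page_number in (0, 1, 2):
    with open(recovery_file_path) as f:
        contents = json.load(f)
    contents["dumped_pages"].append(page_number)
    with open(recovery_file_path, "w") as f:
        json.dump(contents, f)

# On a clean shutdown (the "END" sentinel) the writer removes the file.
os.remove(recovery_file_path)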
pymongoexport_csv.py

@@ -66,20 +66,18 @@ def process_data_page(
     try:
         # Launch single process to write to the filesystem
         writer_worker = mp.Process(
-            target=utils.filesystem_writer, args=(task_queue, header,))
+            target=utils.filesystem_writer,
+            args=(task_queue, header, args.host, args.port,
+                  args.database, args.pagesize, output_dir))
         writer_worker.start()
 
         # Launch pool of workers to perform the format conversion
         with mp.Pool() as pool:
             pool.map(process_data_page, page_index)
-        task_queue.put("END")
+        task_queue.put((-1, "END"))
 
-    except Exception as exc:
+    except (KeyboardInterrupt, Exception):
         logger.error("A fatal error occurred. Script will terminate")
-        error_page = exc  # Change this
         utils.dump_recovery_file(args.host, args.port, args.database,
                                  args.pagesize, globals.dumped_pages,
                                  error_page, output_dir)
 
     if globals.timing:
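The queue protocol between the pool workers and the writer process also changes with this commit: every item is now a (page_number, csv_page) tuple, and the shutdown sentinel becomes (-1, "END") so the writer can tell, via page_number >= 0, whether to record the page in the recovery file. A minimal standalone sketch of that handshake; the writer body and the fake_page payload are illustrative, not the project's real conversion logic:

import multiprocessing as mp

def writer(queue: mp.Queue) -> None:
    # Consume (page_number, payload) tuples until the sentinel arrives.
    while True:
        page_number, payload = queue.get()
        if payload == "END":
            break
        # A real run would append the CSV buffers to their files here and
        # then update the recovery file, because page_number >= 0.
        print("wrote page", page_number, "->", sorted(payload))

if __name__ == "__main__":
    queue = mp.Queue()
    worker = mp.Process(target=writer, args=(queue,))
    worker.start()
    for page_number in range(3):
        fake_page = {"out_{}.csv".format(page_number): "row\n"}  # illustrative payload
        queue.put((page_number, fake_page))
    queue.put((-1, "END"))  # sentinel carries a negative page number
    worker.join()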