Commit f05f9d63 by serpucga

Added dict-like indexing of the pagination system to show "relative" pagination in the logs and the recovery file

Added dict-like indexing of the pagination system to show "relative" pagination in the logs and the recovery file instead of the tweet ids, which would be confusing for the user
parent a0751265
...@@ -6,7 +6,7 @@ import time ...@@ -6,7 +6,7 @@ import time
import datetime import datetime
import multiprocessing as mp import multiprocessing as mp
from math import ceil from math import ceil
from typing import List from typing import List, Dict
from tweet_manager.lib import json2csv, format_csv from tweet_manager.lib import json2csv, format_csv
from config import globals from config import globals
...@@ -91,7 +91,8 @@ def process_page( ...@@ -91,7 +91,8 @@ def process_page(
header: str, header: str,
output_dir: str, output_dir: str,
pagesize: int, pagesize: int,
page_number: List[int], page_number: int,
page_index: Dict[int, int],
queue: mp.Queue)\ queue: mp.Queue)\
-> None: -> None:
""" """
...@@ -113,10 +114,11 @@ def process_page( ...@@ -113,10 +114,11 @@ def process_page(
if globals.timing: if globals.timing:
time0 = time.time() time0 = time.time()
real_page_number = page_index[page_number]
client = pymongo.MongoClient(host, port) client = pymongo.MongoClient(host, port)
database_tweets = client[database]["tweets"] database_tweets = client[database]["tweets"]
tweets_page = get_tweets_page_fast( tweets_page = get_tweets_page_fast(
database_tweets, pagesize, page_number) database_tweets, pagesize, real_page_number)
buffer_tweets = {} buffer_tweets = {}
for tweet in tweets_page: for tweet in tweets_page:
...@@ -387,7 +389,7 @@ def get_tweets_page( ...@@ -387,7 +389,7 @@ def get_tweets_page(
def get_page_index_fast( def get_page_index_fast(
collection: pymongo.collection.Collection, collection: pymongo.collection.Collection,
page_size: int)\ page_size: int)\
-> List[int]: -> Dict[int, int]:
""" """
Get a list of the pages indexed by their tweet ID. Get a list of the pages indexed by their tweet ID.
...@@ -422,7 +424,10 @@ def get_page_index_fast( ...@@ -422,7 +424,10 @@ def get_page_index_fast(
last_id = page[page_size - 1]["id"] last_id = page[page_size - 1]["id"]
except IndexError: except IndexError:
break break
return pages pages_index = {}
for i in range(len(pages)):
pages_index[i] = pages[i]
return pages_index
def get_tweets_page_fast( def get_tweets_page_fast(
......
...@@ -47,23 +47,26 @@ if args.timing: ...@@ -47,23 +47,26 @@ if args.timing:
time0 = time.time() time0 = time.time()
# MongoDB connection to get page index # MongoDB connection to get page index
logger.debug("The indexing of the collection may take a while if "
+ "the collection is too big. Please, be patient...")
if args.recovery: if args.recovery:
with open(args.recovery) as f: with open(args.recovery) as f:
recovery_data = json.load(f) recovery_data = json.load(f)
client = pymongo.MongoClient( client = pymongo.MongoClient(
recovery_data["host"], recovery_data["port"]) recovery_data["host"], recovery_data["port"])
database_tweets = client[recovery_data["database"]]["tweets"] database_tweets = client[recovery_data["database"]]["tweets"]
full_page_index = utils.get_page_index_fast( page_index = utils.get_page_index_fast(
database_tweets, recovery_data["pagesize"]) database_tweets, recovery_data["pagesize"])
client.close() client.close()
page_index = [page for page in full_page_index full_page_index_len = len(page_index)
if page not in recovery_data["dumped_pages"]] for page in recovery_data["dumped_pages"]:
page_index.pop(page, None)
if "error_page" in recovery_data: if "error_page" in recovery_data:
logger.debug("Discarding corrupted page") logger.debug("Discarding corrupted page")
page_index.remove(recovery_data.pop("error_page")) page_index.pop(recovery_data.pop("error_page"))
logger.debug( logger.debug(
"Resuming collection conversion. {} of {} pages left." "Resuming collection conversion. {} of {} pages left."
.format(len(page_index), len(full_page_index))) .format(len(page_index), full_page_index_len))
else: else:
client = pymongo.MongoClient(args.host, args.port) client = pymongo.MongoClient(args.host, args.port)
...@@ -79,10 +82,11 @@ else: ...@@ -79,10 +82,11 @@ else:
def process_data_page( def process_data_page(
page, host=args.host, port=args.port, database=args.database, page, host=args.host, port=args.port, database=args.database,
pagesize=args.pagesize, header=header, outputdir=output_dir, pagesize=args.pagesize, header=header, outputdir=output_dir,
queue=task_queue): queue=task_queue, page_index=page_index):
utils.process_page( utils.process_page(
host, port, database, header, output_dir, pagesize, page, queue) host, port, database, header, output_dir, pagesize, page,
page_index, queue)
# Launch single process to write to the filesystem # Launch single process to write to the filesystem
...@@ -99,7 +103,7 @@ except Exception: ...@@ -99,7 +103,7 @@ except Exception:
# Launch pool of workers to perform the format conversion # Launch pool of workers to perform the format conversion
try: try:
with mp.Pool() as pool: with mp.Pool() as pool:
pool.map(process_data_page, page_index) pool.map(process_data_page, page_index.keys())
except utils.ExceptionAtPage as exc: except utils.ExceptionAtPage as exc:
logger.error("Error detected at page {}".format(exc.error_page)) logger.error("Error detected at page {}".format(exc.error_page))
task_queue.put((exc.error_page, "ERROR")) task_queue.put((exc.error_page, "ERROR"))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment