Commit 2612720f by serpucga

Added fast versions for getting and indexing pages

Found that the old pagination system, based on skip() and limit(), scaled terribly for large collections. However, if the pagination does not skip records but instead asks for the tweets with a higher or lower value in one field, the query is much faster. Thus, using the unique "id" field as the pagination and retrieval index can work even for large collections.
parent 9ba7bac1
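
For context, a minimal sketch of the two approaches described above (not part of the commit; the host, database name and ID value are made-up, and a tweets collection with a numeric "id" field is assumed):

import pymongo

client = pymongo.MongoClient("localhost", 27017)  # hypothetical host/port
tweets = client["somedb"]["tweets"]               # hypothetical database name
page_size = 1000
page_number = 500

# Slow: skip() still walks over every skipped document, so the cost
# grows linearly with how deep the requested page is.
slow_page = tweets.find()\
    .sort("id", pymongo.DESCENDING)\
    .skip(page_number * page_size)\
    .limit(page_size)

# Fast ("keyset" pagination): seek straight to the page with a range
# query on the indexed "id" field; the cost stays roughly constant
# no matter how deep the page is.
last_id_of_previous_page = 1110000000000000000    # hypothetical tweet ID
fast_page = tweets.find({"id": {"$lt": last_id_of_previous_page}})\
    .sort("id", pymongo.DESCENDING)\
    .limit(page_size)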
@@ -115,7 +115,8 @@ def process_page(
     client = pymongo.MongoClient(host, port)
     database_tweets = client[database]["tweets"]
-    tweets_page = get_tweets_page(database_tweets, pagesize, page_number)
+    tweets_page = get_tweets_page_fast(
+        database_tweets, pagesize, page_number)
     buffer_tweets = {}
     for tweet in tweets_page:
@@ -383,6 +384,68 @@ def get_tweets_page(
     return tweets
+
+
+def get_page_index_fast(
+        collection: pymongo.collection.Collection,
+        page_size: int)\
+        -> List[int]:
+    """
+    Get a list of the pages, indexed by their first tweet ID.
+
+    skip() is very slow for large collections, where we may need to
+    skip millions of records. It is much better for performance to
+    paginate with references to some identifier field, in this case
+    the "id" of the tweets. This function takes the first and last
+    ID of a page of "page_size" tweets, then asks in a loop for the
+    page of tweets that follows the last ID found. This way it builds
+    a list with the first ID of each page. The user can then get page
+    i by asking for the "page_size" tweets with an ID lower than or
+    equal to pages[i] (IDs are sorted in descending order). The loop
+    stops adding pages when it reaches one that is not complete.
+
+    :param collection: pymongo collection of tweets
+    :param page_size: number of tweets in each page
+    :returns: list of page indexes (first tweet ID of each page)
+    """
+    pages = []
+    first_page = collection.find()\
+        .sort("id", pymongo.DESCENDING)\
+        .limit(page_size)
+    try:
+        pages.append(first_page[0]["id"])
+        last_id = first_page[page_size - 1]["id"]
+    except IndexError:
+        # Fewer than "page_size" tweets in total: just one page
+        return pages
+    while True:
+        page = collection.find({"id": {"$lt": last_id}})\
+            .sort("id", pymongo.DESCENDING)\
+            .limit(page_size)
+        try:
+            pages.append(page[0]["id"])
+            last_id = page[page_size - 1]["id"]
+        except IndexError:
+            # Reached an empty or incomplete final page: stop here
+            break
+    return pages
+
+
+def get_tweets_page_fast(
+        collection: pymongo.collection.Collection,
+        page_size: int,
+        page_index: int)\
+        -> pymongo.cursor.Cursor:
+    """
+    Get a cursor pointing to the Mongo entries for that page.
+
+    :param collection: pymongo collection of tweets
+    :param page_size: number of tweets in each page
+    :param page_index: ID of the first tweet of the page, as returned
+        by get_page_index_fast()
+    :returns: a pymongo cursor pointing to the tweets of the page
+    """
+    tweets = collection\
+        .find({"id": {"$lte": page_index}})\
+        .sort("id", pymongo.DESCENDING)\
+        .limit(page_size)
+    return tweets
+
+
 #########################
 #  METADATA GENERATION  #
 #########################
@@ -68,7 +68,7 @@ if args.recovery:
 else:
     client = pymongo.MongoClient(args.host, args.port)
     database_tweets = client[args.database]["tweets"]
-    page_index = utils.get_page_index(database_tweets, args.pagesize)
+    page_index = utils.get_page_index_fast(database_tweets, args.pagesize)
     client.close()
     logger.debug(
         "Database {} partitioned in {} pages of {} tweets (maximum)"
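For reference, a hypothetical end-to-end sketch of how the two new helpers fit together (the connection details and the "utils" import path are assumptions, not part of the commit):

import pymongo
import utils  # assumed to be the module containing the helpers above

client = pymongo.MongoClient("localhost", 27017)  # hypothetical host/port
database_tweets = client["somedb"]["tweets"]      # hypothetical database name
pagesize = 1000

# The seek on "id" is only fast if that field is indexed (MongoDB only
# indexes "_id" by default), so make sure the index exists:
database_tweets.create_index([("id", pymongo.DESCENDING)])

# Build the page index once: one entry per page, holding the ID of the
# first tweet of that page.
page_index = utils.get_page_index_fast(database_tweets, pagesize)

# Fetch each page by seeking directly to its first tweet ID.
for first_id in page_index:
    for tweet in utils.get_tweets_page_fast(database_tweets, pagesize, first_id):
        pass  # process the tweet here

client.close()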