Commit 34776b63 by serpucga

Simpler, more elegant and slightly faster version using the cursors instead of…

Simpler, more elegant and slightly faster version using the cursors instead of building a list of tweets for each page
parent e093713a
......@@ -3,16 +3,21 @@
import pymongo
import os
import argparse
from math import ceil
from lib import utils
def get_page_index(collection, page_size: int):
    """Split *collection* into pages and return one tweet cursor per page.

    :param collection: pymongo Collection holding the tweets.
    :param page_size: maximum number of tweets per page.
    :return: list with one cursor (as built by get_tweets_page) per
        page, together covering every document in the collection.
    """
    # Collection.count() was deprecated in pymongo 3.7 and removed in
    # 4.0; count_documents({}) is the supported replacement.
    num_pages = ceil(collection.count_documents({}) / page_size)
    return [get_tweets_page(collection, page_size, num_page)
            for num_page in range(num_pages)]
def get_tweets_page(collection, page_size: int, num_page: int):
tweets_page = []
tweets = collection.find().skip(num_page * page_size).limit(page_size)
for tweet in tweets:
tweets_page.append(tweet)
return tweets_page
return tweets
if __name__ == "__main__":
......@@ -40,10 +45,10 @@ if __name__ == "__main__":
buffer_tweets = {}
num_page = 0
tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page)
while len(tweets_page) != 0:
page_index = get_page_index(database_tweets, args.pagesize)
for page in page_index:
buffer_tweets = {}
for tweet in tweets_page:
for tweet in page:
# Get output path and contents for the new CSV file
csv_tweet_output_path =\
utils.create_tweet_output_path(header, tweet, output_dir)
......@@ -65,7 +70,3 @@ if __name__ == "__main__":
utils.increase_metadata_count(
os.path.join(output_dir, ".metadata.json"),
output_path, increase=buffer_tweets[output_path][1])
num_page += 1
tweets_page =\
get_tweets_page(database_tweets, args.pagesize, num_page)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment