Commit 3cab07f8 by serpucga

Added pagination system to avoid memory issues

parent 16f47745
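
The change below bounds memory use by reading the collection in fixed-size pages (skip/limit) and flushing the per-file CSV buffers after every page, instead of accumulating all buffers before a single write pass, as the old version did. A minimal sketch of the pattern, assuming a pymongo collection `collection` and a per-page `process` step (both names are illustrative, not from the commit):

    page_size = 1000
    num_page = 0
    while True:
        # skip/limit fetches one fixed-size window of the result set at a time
        page = list(collection.find().skip(num_page * page_size).limit(page_size))
        if not page:
            break
        process(page)  # hypothetical stand-in for the buffer-and-flush step
        num_page += 1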
@@ -7,46 +7,65 @@ import argparse
 from lib import utils

-# Command line parsing
-parser = argparse.ArgumentParser(
-    description="Dump the tweets of a database to a JSON file")
-parser.add_argument("-H", "--host", type=str, default="localhost")
-parser.add_argument("-p", "--port", type=int, default=27017)
-parser.add_argument("database", type=str)
-args = parser.parse_args()
-
-# Dirs and files
-script_dir = os.path.dirname(__file__)
-output_dir = os.path.join(script_dir, "pymongodump", args.database)
-header_file = os.path.join(script_dir, "header.txt")
-
-# MongoDB connection
-client = pymongo.MongoClient(args.host, args.port)
-database_tweets = client[args.database]["tweets"]
-
-with open(header_file) as f:
-    header = f.readline()
-buffer_tweets = {}
-
-for tweet in database_tweets.find():
-    # Get output path and contents for the new CSV file
-    csv_tweet_output_path =\
-        utils.create_tweet_output_path(tweet, output_dir)
-    csv_tweet_contents =\
-        "\n" + str(utils.convert_tweet_to_csv(header, tweet))
-
-    # Check if buffer exists for the file. If not, add to dictionary
-    if csv_tweet_output_path not in buffer_tweets.keys():
-        buffer_tweets[csv_tweet_output_path] = ["", 0]
-
-    # Update the buffer adding the tweet and increasing tweet count
-    buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
-    buffer_tweets[csv_tweet_output_path][1] += 1
-
-# Perform the write operations in each of the files
-for output_path in buffer_tweets.keys():
-    with open(output_path, "a") as tweet_writer:
-        tweet_writer.write(buffer_tweets[output_path][0])
-    utils.increase_metadata_count(
-        os.path.join(output_dir, ".metadata.json"),
-        output_path, increase=buffer_tweets[output_path][1])
+def get_tweets_page(collection, page_size: int, num_page: int):
+    tweets_page = []
+    tweets = collection.find().skip(num_page * page_size).limit(page_size)
+    for tweet in tweets:
+        tweets_page.append(tweet)
+    return tweets_page
+
+
+if __name__ == "__main__":
+    # Command line parsing
+    parser = argparse.ArgumentParser(
+        description="Dump the tweets of a database to a JSON file")
+    parser.add_argument("-H", "--host", type=str, default="localhost")
+    parser.add_argument("-p", "--port", type=int, default=27017)
+    parser.add_argument("-s", "--pagesize", type=int, default=1000)
+    parser.add_argument("database", type=str)
+    args = parser.parse_args()
+
+    # Dirs and files
+    script_dir = os.path.dirname(__file__)
+    output_dir = os.path.join(script_dir, "pymongodump", args.database)
+    header_file = os.path.join(script_dir, "header.txt")
+
+    # MongoDB connection
+    client = pymongo.MongoClient(args.host, args.port)
+    database_tweets = client[args.database]["tweets"]
+
+    with open(header_file) as f:
+        header = f.readline()
+    buffer_tweets = {}
+
+    num_page = 0
+    tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page)
+    while len(tweets_page) != 0:
+        buffer_tweets = {}
+        for tweet in tweets_page:
+            # Get output path and contents for the new CSV file
+            csv_tweet_output_path =\
+                utils.create_tweet_output_path(tweet, output_dir)
+            csv_tweet_contents =\
+                "\n" + str(utils.convert_tweet_to_csv(header, tweet))
+
+            # Check if buffer exists for the file. If not, add to dictionary
+            if csv_tweet_output_path not in buffer_tweets.keys():
+                buffer_tweets[csv_tweet_output_path] = ["", 0]
+
+            # Update the buffer adding the tweet and increasing tweet count
+            buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
+            buffer_tweets[csv_tweet_output_path][1] += 1
+
+        # Perform the write operations in each of the files
+        for output_path in buffer_tweets.keys():
+            with open(output_path, "a") as tweet_writer:
+                tweet_writer.write(buffer_tweets[output_path][0])
+            utils.increase_metadata_count(
+                os.path.join(output_dir, ".metadata.json"),
+                output_path, increase=buffer_tweets[output_path][1])
+
+        num_page += 1
+        tweets_page =\
+            get_tweets_page(database_tweets, args.pagesize, num_page)
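
One caveat worth noting (not part of the commit): MongoDB's skip() still walks over the skipped documents on the server, so fetching page N costs time proportional to N * page_size, and the dump slows down as it progresses. A hedged alternative sketch using range-based pagination on the indexed _id field; get_tweets_after and last_id are illustrative names, not from this codebase:

    import pymongo

    def get_tweets_after(collection, last_id, page_size: int):
        # Resume after the last seen _id; the range query uses the default
        # _id index, so every page costs roughly the same to fetch.
        query = {} if last_id is None else {"_id": {"$gt": last_id}}
        return list(collection.find(query)
                    .sort("_id", pymongo.ASCENDING)
                    .limit(page_size))

    last_id = None
    while True:
        tweets_page = get_tweets_after(database_tweets, last_id, args.pagesize)
        if not tweets_page:
            break
        # ... buffer and flush the page exactly as in the commit ...
        last_id = tweets_page[-1]["_id"]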