Commit 3cab07f8 by serpucga

Added pagination system to avoid memory issues

parent 16f47745
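
The change below bounds memory use by reading the collection in fixed-size pages (skip/limit) and flushing the per-file CSV buffers after every page, instead of accumulating all buffers before a single write pass, as the old version did. A minimal sketch of the pattern, assuming a pymongo collection `collection` and a per-page `process` step (both names are illustrative, not from the commit):

    page_size = 1000
    num_page = 0
    while True:
        # skip/limit fetches one fixed-size window of the result set at a time
        page = list(collection.find().skip(num_page * page_size).limit(page_size))
        if not page:
            break
        process(page)  # hypothetical stand-in for the buffer-and-flush step
        num_page += 1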
@@ -7,46 +7,65 @@ import argparse
 from lib import utils

-# Command line parsing
-parser = argparse.ArgumentParser(
-    description="Dump the tweets of a database to a JSON file")
-parser.add_argument("-H", "--host", type=str, default="localhost")
-parser.add_argument("-p", "--port", type=int, default=27017)
-parser.add_argument("database", type=str)
-args = parser.parse_args()
-
-# Dirs and files
-script_dir = os.path.dirname(__file__)
-output_dir = os.path.join(script_dir, "pymongodump", args.database)
-header_file = os.path.join(script_dir, "header.txt")
-
-# MongoDB connection
-client = pymongo.MongoClient(args.host, args.port)
-database_tweets = client[args.database]["tweets"]
-
-with open(header_file) as f:
-    header = f.readline()
-buffer_tweets = {}
-
-for tweet in database_tweets.find():
-    # Get output path and contents for the new CSV file
-    csv_tweet_output_path =\
-        utils.create_tweet_output_path(tweet, output_dir)
-    csv_tweet_contents =\
-        "\n" + str(utils.convert_tweet_to_csv(header, tweet))
-
-    # Check if buffer exists for the file. If not, add to dictionary
-    if csv_tweet_output_path not in buffer_tweets.keys():
-        buffer_tweets[csv_tweet_output_path] = ["", 0]
-
-    # Update the buffer adding the tweet and increasing tweet count
-    buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
-    buffer_tweets[csv_tweet_output_path][1] += 1
-
-# Perform the write operations in each of the files
-for output_path in buffer_tweets.keys():
-    with open(output_path, "a") as tweet_writer:
-        tweet_writer.write(buffer_tweets[output_path][0])
-    utils.increase_metadata_count(
-        os.path.join(output_dir, ".metadata.json"),
-        output_path, increase=buffer_tweets[output_path][1])
+def get_tweets_page(collection, page_size: int, num_page: int):
+    tweets_page = []
+    tweets = collection.find().skip(num_page * page_size).limit(page_size)
+    for tweet in tweets:
+        tweets_page.append(tweet)
+    return tweets_page
+
+
+if __name__ == "__main__":
+    # Command line parsing
+    parser = argparse.ArgumentParser(
+        description="Dump the tweets of a database to a JSON file")
+    parser.add_argument("-H", "--host", type=str, default="localhost")
+    parser.add_argument("-p", "--port", type=int, default=27017)
+    parser.add_argument("-s", "--pagesize", type=int, default=1000)
+    parser.add_argument("database", type=str)
+    args = parser.parse_args()
+
+    # Dirs and files
+    script_dir = os.path.dirname(__file__)
+    output_dir = os.path.join(script_dir, "pymongodump", args.database)
+    header_file = os.path.join(script_dir, "header.txt")
+
+    # MongoDB connection
+    client = pymongo.MongoClient(args.host, args.port)
+    database_tweets = client[args.database]["tweets"]
+
+    with open(header_file) as f:
+        header = f.readline()
+    buffer_tweets = {}
+
+    num_page = 0
+    tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page)
+    while len(tweets_page) != 0:
+        buffer_tweets = {}
+        for tweet in tweets_page:
+            # Get output path and contents for the new CSV file
+            csv_tweet_output_path =\
+                utils.create_tweet_output_path(tweet, output_dir)
+            csv_tweet_contents =\
+                "\n" + str(utils.convert_tweet_to_csv(header, tweet))
+
+            # Check if buffer exists for the file. If not, add to dictionary
+            if csv_tweet_output_path not in buffer_tweets.keys():
+                buffer_tweets[csv_tweet_output_path] = ["", 0]
+
+            # Update the buffer adding the tweet and increasing tweet count
+            buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
+            buffer_tweets[csv_tweet_output_path][1] += 1
+
+        # Perform the write operations in each of the files
+        for output_path in buffer_tweets.keys():
+            with open(output_path, "a") as tweet_writer:
+                tweet_writer.write(buffer_tweets[output_path][0])
+            utils.increase_metadata_count(
+                os.path.join(output_dir, ".metadata.json"),
+                output_path, increase=buffer_tweets[output_path][1])
+
+        num_page += 1
+        tweets_page =\
+            get_tweets_page(database_tweets, args.pagesize, num_page)
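
One caveat worth noting (not part of the commit): MongoDB's skip() still walks over the skipped documents on the server, so fetching page N costs time proportional to N * page_size, and the dump slows down as it progresses. A hedged alternative sketch using range-based pagination on the indexed _id field; get_tweets_after and last_id are illustrative names, not from this codebase:

    import pymongo

    def get_tweets_after(collection, last_id, page_size: int):
        # Resume after the last seen _id; the range query uses the default
        # _id index, so every page costs roughly the same to fetch.
        query = {} if last_id is None else {"_id": {"$gt": last_id}}
        return list(collection.find(query)
                    .sort("_id", pymongo.ASCENDING)
                    .limit(page_size))

    last_id = None
    while True:
        tweets_page = get_tweets_after(database_tweets, last_id, args.pagesize)
        if not tweets_page:
            break
        # ... buffer and flush the page exactly as in the commit ...
        last_id = tweets_page[-1]["_id"]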