Commit 34776b63 by serpucga

Simpler, more elegant and slightly faster version using the cursors instead of…

Simpler, more elegant and slightly faster version using the cursors instead of building a list of tweets for each page
parent e093713a
......@@ -3,16 +3,21 @@
import pymongo
import os
import argparse
from math import ceil
from lib import utils
def get_page_index(collection, page_size: int):
    """Split *collection* into pages and return one tweet cursor per page.

    :param collection: pymongo Collection holding the tweets.
    :param page_size: maximum number of tweets per page.
    :return: list with one cursor (as built by get_tweets_page) per
        page, together covering every document in the collection.
    """
    # Collection.count() was deprecated in pymongo 3.7 and removed in
    # 4.0; count_documents({}) is the supported replacement.
    num_pages = ceil(collection.count_documents({}) / page_size)
    return [get_tweets_page(collection, page_size, num_page)
            for num_page in range(num_pages)]
def get_tweets_page(collection, page_size: int, num_page: int):
tweets_page = []
tweets = collection.find().skip(num_page * page_size).limit(page_size)
for tweet in tweets:
tweets_page.append(tweet)
return tweets_page
return tweets
if __name__ == "__main__":
......@@ -40,10 +45,10 @@ if __name__ == "__main__":
buffer_tweets = {}
num_page = 0
tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page)
while len(tweets_page) != 0:
page_index = get_page_index(database_tweets, args.pagesize)
for page in page_index:
buffer_tweets = {}
for tweet in tweets_page:
for tweet in page:
# Get output path and contents for the new CSV file
csv_tweet_output_path =\
utils.create_tweet_output_path(header, tweet, output_dir)
......@@ -65,7 +70,3 @@ if __name__ == "__main__":
utils.increase_metadata_count(
os.path.join(output_dir, ".metadata.json"),
output_path, increase=buffer_tweets[output_path][1])
num_page += 1
tweets_page =\
get_tweets_page(database_tweets, args.pagesize, num_page)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment