Commit 34776b63 by serpucga

Simpler, more elegant and slightly faster version using the cursors instead of…

Simpler, more elegant and slightly faster version using the cursors instead of building a list of tweets for each page
parent e093713a
...@@ -3,16 +3,21 @@ ...@@ -3,16 +3,21 @@
import pymongo import pymongo
import os import os
import argparse import argparse
from math import ceil
from lib import utils from lib import utils
def get_page_index(collection, page_size: int):
    """Build the list of result pages covering every document in a collection.

    One entry is produced per page by calling ``get_tweets_page`` with
    consecutive page numbers starting at 0, so each entry is whatever that
    function returns; the pages are not iterated here.

    :param collection: collection to paginate; it must provide
        ``count_documents`` (PyMongo >= 3.7) and be accepted by
        ``get_tweets_page``
    :param page_size: maximum number of documents per page; must be >= 1
    :returns: list with one page per ``page_size`` documents (empty when the
        collection holds no documents)
    :raises ValueError: if ``page_size`` is smaller than 1
    """
    if page_size < 1:
        # Fail with a clear message instead of a ZeroDivisionError (or a
        # silently empty index for negative sizes).
        raise ValueError("page_size must be a positive integer")

    # Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
    # count_documents({}) is the supported way to count every document.
    total = collection.count_documents({})

    # Integer ceiling division: exact for any count, no float rounding.
    num_pages = (total + page_size - 1) // page_size

    return [
        get_tweets_page(collection, page_size, num_page)
        for num_page in range(num_pages)
    ]
# Fetch one page of tweets: the query skips num_page * page_size documents
# and limits the result to page_size documents.
# NOTE(review): these rows are a side-by-side diff collapsed onto single
# lines. The removed version (left column) copied every result into the
# list tweets_page and returned that list; the added version (right column)
# returns the result of find().skip().limit() directly.
def get_tweets_page(collection, page_size: int, num_page: int): def get_tweets_page(collection, page_size: int, num_page: int):
tweets_page = []
tweets = collection.find().skip(num_page * page_size).limit(page_size) tweets = collection.find().skip(num_page * page_size).limit(page_size)
for tweet in tweets: return tweets
tweets_page.append(tweet)
return tweets_page
if __name__ == "__main__": if __name__ == "__main__":
...@@ -40,10 +45,10 @@ if __name__ == "__main__": ...@@ -40,10 +45,10 @@ if __name__ == "__main__":
buffer_tweets = {} buffer_tweets = {}
num_page = 0 num_page = 0
tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page) page_index = get_page_index(database_tweets, args.pagesize)
while len(tweets_page) != 0: for page in page_index:
buffer_tweets = {} buffer_tweets = {}
for tweet in tweets_page: for tweet in page:
# Get output path and contents for the new CSV file # Get output path and contents for the new CSV file
csv_tweet_output_path =\ csv_tweet_output_path =\
utils.create_tweet_output_path(header, tweet, output_dir) utils.create_tweet_output_path(header, tweet, output_dir)
...@@ -65,7 +70,3 @@ if __name__ == "__main__": ...@@ -65,7 +70,3 @@ if __name__ == "__main__":
utils.increase_metadata_count( utils.increase_metadata_count(
os.path.join(output_dir, ".metadata.json"), os.path.join(output_dir, ".metadata.json"),
output_path, increase=buffer_tweets[output_path][1]) output_path, increase=buffer_tweets[output_path][1])
num_page += 1
tweets_page =\
get_tweets_page(database_tweets, args.pagesize, num_page)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment