Commit 2612720f by serpucga

Added fast versions for getting and indexing pages

Found that the old pagination system, based on skip() and limit(), scaled terribly for large collections. However, if the pagination does not skip records but instead asks for the tweets with a higher or lower value in one field, the query is much faster. Thus, using the unique "id" field as the pagination and retrieval index can work even for large collections.
parent 9ba7bac1
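
For context, a minimal sketch of the two approaches described above (not part of the commit; the host, database name and ID value are made-up, and a tweets collection with a numeric "id" field is assumed):

import pymongo

client = pymongo.MongoClient("localhost", 27017)  # hypothetical host/port
tweets = client["somedb"]["tweets"]               # hypothetical database name
page_size = 1000
page_number = 500

# Slow: skip() still walks over every skipped document, so the cost
# grows linearly with how deep the requested page is.
slow_page = tweets.find()\
    .sort("id", pymongo.DESCENDING)\
    .skip(page_number * page_size)\
    .limit(page_size)

# Fast ("keyset" pagination): seek straight to the page with a range
# query on the indexed "id" field; the cost stays roughly constant
# no matter how deep the page is.
last_id_of_previous_page = 1110000000000000000    # hypothetical tweet ID
fast_page = tweets.find({"id": {"$lt": last_id_of_previous_page}})\
    .sort("id", pymongo.DESCENDING)\
    .limit(page_size)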
@@ -115,7 +115,8 @@ def process_page(
     client = pymongo.MongoClient(host, port)
     database_tweets = client[database]["tweets"]
-    tweets_page = get_tweets_page(database_tweets, pagesize, page_number)
+    tweets_page = get_tweets_page_fast(
+        database_tweets, pagesize, page_number)
     buffer_tweets = {}
     for tweet in tweets_page:
@@ -383,6 +384,68 @@ def get_tweets_page(
     return tweets
+
+
+def get_page_index_fast(
+        collection: pymongo.collection.Collection,
+        page_size: int)\
+        -> List[int]:
+    """
+    Get a list of the pages, indexed by their first tweet ID.
+
+    skip() is very slow for large collections, where we may need to
+    skip millions of records. It is much better for performance to
+    paginate with references to some identifier field, in this case
+    the "id" of the tweets. This function takes the first and last
+    ID of a page of "page_size" tweets, then asks in a loop for the
+    page of tweets that follows the last ID found. This way it builds
+    a list with the first ID of each page. The user can then get page
+    i by asking for the "page_size" tweets with an ID lower than or
+    equal to pages[i] (IDs are sorted in descending order). The loop
+    stops adding pages when it reaches one that is not complete.
+
+    :param collection: pymongo collection of tweets
+    :param page_size: number of tweets in each page
+    :returns: list of page indexes (first tweet ID of each page)
+    """
+    pages = []
+    first_page = collection.find()\
+        .sort("id", pymongo.DESCENDING)\
+        .limit(page_size)
+    try:
+        pages.append(first_page[0]["id"])
+        last_id = first_page[page_size - 1]["id"]
+    except IndexError:
+        # Fewer than "page_size" tweets in total: just one page
+        return pages
+    while True:
+        page = collection.find({"id": {"$lt": last_id}})\
+            .sort("id", pymongo.DESCENDING)\
+            .limit(page_size)
+        try:
+            pages.append(page[0]["id"])
+            last_id = page[page_size - 1]["id"]
+        except IndexError:
+            # Reached an empty or incomplete final page: stop here
+            break
+    return pages
+
+
+def get_tweets_page_fast(
+        collection: pymongo.collection.Collection,
+        page_size: int,
+        page_index: int)\
+        -> pymongo.cursor.Cursor:
+    """
+    Get a cursor pointing to the Mongo entries for that page.
+
+    :param collection: pymongo collection of tweets
+    :param page_size: number of tweets in each page
+    :param page_index: ID of the first tweet of the page, as returned
+        by get_page_index_fast()
+    :returns: a pymongo cursor pointing to the tweets of the page
+    """
+    tweets = collection\
+        .find({"id": {"$lte": page_index}})\
+        .sort("id", pymongo.DESCENDING)\
+        .limit(page_size)
+    return tweets
+
+
 #########################
 #  METADATA GENERATION  #
 #########################
@@ -68,7 +68,7 @@ if args.recovery:
 else:
     client = pymongo.MongoClient(args.host, args.port)
     database_tweets = client[args.database]["tweets"]
-    page_index = utils.get_page_index(database_tweets, args.pagesize)
+    page_index = utils.get_page_index_fast(database_tweets, args.pagesize)
     client.close()
     logger.debug(
         "Database {} partitioned in {} pages of {} tweets (maximum)"
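For reference, a hypothetical end-to-end sketch of how the two new helpers fit together (the connection details and the "utils" import path are assumptions, not part of the commit):

import pymongo
import utils  # assumed to be the module containing the helpers above

client = pymongo.MongoClient("localhost", 27017)  # hypothetical host/port
database_tweets = client["somedb"]["tweets"]      # hypothetical database name
pagesize = 1000

# The seek on "id" is only fast if that field is indexed (MongoDB only
# indexes "_id" by default), so make sure the index exists:
database_tweets.create_index([("id", pymongo.DESCENDING)])

# Build the page index once: one entry per page, holding the ID of the
# first tweet of that page.
page_index = utils.get_page_index_fast(database_tweets, pagesize)

# Fetch each page by seeking directly to its first tweet ID.
for first_id in page_index:
    for tweet in utils.get_tweets_page_fast(database_tweets, pagesize, first_id):
        pass  # process the tweet here

client.close()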