Commit ed2c9d74 by serpucga

First parallel version of the code

Parallelized using the multiprocessing library. I'm not really sure the code is thread safe. I think we don't care if tweets are appended to the files in a different order, but corrupted metadata files would be problematic. In the first tests the metadata files came out fine, but I think this line is probably not thread safe (two workers could load the old value and try to update it at the same time, resulting in inconsistencies):

    metadata_file["files"][file_path]["count"] += increase

Apart from that, the code is much faster than before.
parent 34776b63
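One way to make that increment safe would be to serialize the read-modify-write of the metadata JSON with a shared lock handed to every worker process. The sketch below only illustrates that idea and is not part of this commit; the helper name increase_metadata_count_locked is hypothetical and the JSON layout is assumed from the line quoted above.

    import json
    import multiprocessing as mp

    # Created once in the parent and passed to every worker through the
    # Process args tuple, so that all workers share the same lock object.
    metadata_lock = mp.Lock()

    def increase_metadata_count_locked(lock, metadata_path, file_path, increase=1):
        # Hypothetical helper (not in this commit): the lock ensures that only
        # one process at a time runs the load -> update -> dump cycle, so no
        # increment is lost to a concurrent read of a stale count.
        with lock:
            with open(metadata_path, "r") as f:
                metadata = json.load(f)
            metadata["files"][file_path]["count"] += increase
            with open(metadata_path, "w") as f:
                json.dump(metadata, f)

write_tweets_to_files would then receive the lock as an extra argument and call this helper instead of utils.increase_metadata_count.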
@@ -33,7 +33,7 @@ def create_task_database_structure(
def generate_metadata_file(metadata_path) -> None:
    print("Executing generate_metadata_file")
    file_metadata = {}  # type: Dict
    file_metadata = {}
    metadata = {}
    metadata["files"] = file_metadata
@@ -92,7 +92,10 @@ def create_tweet_output_path(
        tweet: dict,
        output_dir: str)\
        -> str:
    try:
        collection_path = create_task_database_structure(output_dir)
    except FileExistsError as e:
        collection_path = e.filename
    # Extract year, month and date from the tweet using a regex
    matchObj = re.search(
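The try/except added above follows the EAFP pattern: attempt to create the task's directory structure and, if it already exists, recover the existing path from the exception instead of checking for the directory first. A minimal standalone sketch of the same pattern (names are illustrative, not from the repo):

    import os

    def ensure_dir(path: str) -> str:
        try:
            os.makedirs(path)
        except FileExistsError:
            pass  # already created, possibly by another process
        return path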
@@ -3,6 +3,7 @@
import pymongo
import os
import argparse
import multiprocessing as mp
from math import ceil
from lib import utils
@@ -20,6 +21,33 @@ def get_tweets_page(collection, page_size: int, num_page: int):
    return tweets


def write_tweets_to_files(header: str, output_dir: str, tweets_page):
    print("Hi there! write_tweets_to_files executing")
    buffer_tweets = {}
    for tweet in tweets_page:
        # Get output path and contents for the new CSV file
        csv_tweet_output_path =\
            utils.create_tweet_output_path(header, tweet, output_dir)
        csv_tweet_contents =\
            "\n" + str(utils.convert_tweet_to_csv(header, tweet))
        # Check if buffer exists for the file. If not, add to dictionary
        if csv_tweet_output_path not in buffer_tweets.keys():
            buffer_tweets[csv_tweet_output_path] = ["", 0]
        # Update the buffer adding the tweet and increasing tweet count
        buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
        buffer_tweets[csv_tweet_output_path][1] += 1
    # Perform the write operations in each of the files
    for output_path in buffer_tweets.keys():
        with open(output_path, "a") as tweet_writer:
            tweet_writer.write(buffer_tweets[output_path][0])
        utils.increase_metadata_count(
            os.path.join(output_dir, ".metadata.json"),
            output_path, increase=buffer_tweets[output_path][1])


if __name__ == "__main__":
    # Command line parsing
@@ -46,27 +74,13 @@ if __name__ == "__main__":
    num_page = 0
    page_index = get_page_index(database_tweets, args.pagesize)
    for page in page_index:
        buffer_tweets = {}
        for tweet in page:
            # Get output path and contents for the new CSV file
            csv_tweet_output_path =\
                utils.create_tweet_output_path(header, tweet, output_dir)
            csv_tweet_contents =\
                "\n" + str(utils.convert_tweet_to_csv(header, tweet))
            # Check if buffer exists for the file. If not, add to dictionary
            if csv_tweet_output_path not in buffer_tweets.keys():
                buffer_tweets[csv_tweet_output_path] = ["", 0]
            # Update the buffer adding the tweet and increasing tweet count
            buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
            buffer_tweets[csv_tweet_output_path][1] += 1
    output = mp.Queue()  # not yet used by write_tweets_to_files
    # Keep the processes in a list (not a generator) so they can be iterated
    # twice: once to start them all and once to join them
    processes = [mp.Process(
        target=write_tweets_to_files, args=(
            header, output_dir, page)) for page in page_index]
        # Perform the write operations in each of the files
        for output_path in buffer_tweets.keys():
            with open(output_path, "a") as tweet_writer:
                tweet_writer.write(buffer_tweets[output_path][0])
            utils.increase_metadata_count(
                os.path.join(output_dir, ".metadata.json"),
                output_path, increase=buffer_tweets[output_path][1])
    for p in processes:
        p.start()
    for p in processes:
        p.join()
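For reference only, not part of this commit: the same page-per-worker dispatch could also be written with a multiprocessing.Pool, which caps the number of concurrent processes instead of spawning one per page and handles the start/join bookkeeping itself. The pool size of 4 is an arbitrary choice; header, output_dir and page_index are the variables already defined in the script, and mp is the multiprocessing import added in this commit.

    with mp.Pool(processes=4) as pool:
        pool.starmap(
            write_tweets_to_files,
            [(header, output_dir, page) for page in page_index])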