Commit 36d7a65a by serpucga

Merge branch 'feature/parallelism' into develop

parents e093713a 52dbc5e9
pymongodump
tests.py
.mypy_cache
timing = False
... @@ -3,69 +3,83 @@
 import pymongo
 import os
 import argparse
+import logging
+import time
+import multiprocessing as mp
+
+from config import globals
 from lib import utils
 
+# Command line parsing
+parser = argparse.ArgumentParser(
+    description="Dump the tweets of a database to a JSON file")
+parser.add_argument("-H", "--host", type=str, default="localhost")
+parser.add_argument("-p", "--port", type=int, default=27017)
+parser.add_argument("-s", "--pagesize", type=int, default=1000)
+parser.add_argument("-v", "--verbose", action="store_true")
+parser.add_argument("-t", "--timing", action="store_true")
+parser.add_argument("database", type=str)
+args = parser.parse_args()
-def get_tweets_page(collection, page_size: int, num_page: int):
-    tweets_page = []
-    tweets = collection.find().skip(num_page * page_size).limit(page_size)
-    for tweet in tweets:
-        tweets_page.append(tweet)
-    return tweets_page
-
-
-if __name__ == "__main__":
-
-    # Command line parsing
-    parser = argparse.ArgumentParser(
-        description="Dump the tweets of a database to a JSON file")
-    parser.add_argument("-H", "--host", type=str, default="localhost")
-    parser.add_argument("-p", "--port", type=int, default=27017)
-    parser.add_argument("-s", "--pagesize", type=int, default=1000)
-    parser.add_argument("database", type=str)
-    args = parser.parse_args()
-
-    # Dirs and files
-    script_dir = os.path.dirname(__file__)
-    output_dir = os.path.join(script_dir, "pymongodump", args.database)
-    header_file = os.path.join(script_dir, "header.txt")
-
-    # MongoDB connection
-    client = pymongo.MongoClient(args.host, args.port)
-    database_tweets = client[args.database]["tweets"]
-
-    with open(header_file) as f:
-        header = f.readline()
-    buffer_tweets = {}
-    num_page = 0
-
-    tweets_page = get_tweets_page(database_tweets, args.pagesize, num_page)
-    while len(tweets_page) != 0:
-        buffer_tweets = {}
-        for tweet in tweets_page:
-            # Get output path and contents for the new CSV file
-            csv_tweet_output_path =\
-                utils.create_tweet_output_path(header, tweet, output_dir)
-            csv_tweet_contents =\
-                "\n" + str(utils.convert_tweet_to_csv(header, tweet))
-            # Check if buffer exists for the file. If not, add to dictionary
-            if csv_tweet_output_path not in buffer_tweets.keys():
-                buffer_tweets[csv_tweet_output_path] = ["", 0]
-            # Update the buffer adding the tweet and increasing tweet count
-            buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
-            buffer_tweets[csv_tweet_output_path][1] += 1
-
-        # Perform the write operations in each of the files
-        for output_path in buffer_tweets.keys():
-            with open(output_path, "a") as tweet_writer:
-                tweet_writer.write(buffer_tweets[output_path][0])
-            utils.increase_metadata_count(
-                os.path.join(output_dir, ".metadata.json"),
-                output_path, increase=buffer_tweets[output_path][1])
-        num_page += 1
-        tweets_page =\
-            get_tweets_page(database_tweets, args.pagesize, num_page)
+# Logging config
+logformat = "[%(asctime)s] %(message)s"
+dateformat = "%H:%M:%S"
+if args.verbose:
+    logging.basicConfig(
+        level=logging.DEBUG, format=logformat, datefmt=dateformat)
+else:
+    logging.basicConfig(
+        level=logging.ERROR, format=logformat, datefmt=dateformat)
+logger = logging.getLogger(__name__)
+
+# Initialize some variables
+script_dir = os.path.dirname(__file__)
+output_dir = os.path.join(script_dir, "pymongodump", args.database)
+header_file = os.path.join(script_dir, "config", "header.txt")
+with open(header_file) as f:
+    header = f.readline()
+buffer_tweets = {}
+task_queue = mp.Queue()
+if args.timing:
+    globals.timing = True
+    time0 = time.time()
+
+# MongoDB connection to get page index
+client = pymongo.MongoClient(args.host, args.port)
+database_tweets = client[args.database]["tweets"]
+page_index = utils.get_page_index(database_tweets, args.pagesize)
+client.close()
+logger.debug(
+    "Database {} partitioned in {} pages of {} tweets (maximum)"
+    .format(args.database, len(page_index), args.pagesize))
+
+
+# Build a picklable function that we can pass to map
+def process_data_page(
+        page, host=args.host, port=args.port, database=args.database,
+        pagesize=args.pagesize, header=header, outputdir=output_dir,
+        queue=task_queue):
+    utils.process_page(
+        host, port, database, header, output_dir, pagesize, page, queue)
+
+
+# Launch single process to write to the filesystem
+writer_worker = mp.Process(
+    target=utils.filesystem_writer, args=(task_queue, header, ))
+writer_worker.start()
+
+# Launch pool of workers to perform the format conversion
+with mp.Pool() as pool:
+    pool.map(process_data_page, page_index)
+task_queue.put("END")
+
+if globals.timing:
+    time1 = time.time()
+
+utils.generate_metadata_file(output_dir)
+
+if globals.timing:
+    logger.critical(
+        "Time spent generating metadata file: {}s"
+        .format(time.time() - time1))
+    logger.critical(
+        "Total execution time: {}s"
+        .format(time.time() - time0))
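The functions doing the real work (utils.get_page_index, utils.process_page, utils.filesystem_writer) live in the utilities module and are not part of this hunk. As rough orientation only, and not the code from the repository: the pattern is a pool of converter workers feeding a single writer process through task_queue, shut down by the "END" sentinel. A minimal sketch, assuming each queued item maps an output path to a (csv_text, tweet_count) buffer like the one the removed serial loop built; the function name, payload format, and header-on-first-write behaviour are assumptions, while the (queue, header) signature and the "END" sentinel come from the launch code above.

import os
import multiprocessing as mp


def filesystem_writer_sketch(queue, header):
    # Hypothetical sketch, NOT the implementation in lib/utils: drain the task
    # queue until the "END" sentinel arrives, appending each buffered chunk of
    # CSV text to its output file so that only this process touches the disk.
    while True:
        task = queue.get()
        if task == "END":
            break
        for output_path, (csv_text, tweet_count) in task.items():
            new_file = not os.path.isfile(output_path)
            with open(output_path, "a") as f:
                if new_file:
                    f.write(header)  # assumed: header written once per file
                f.write(csv_text)
            # tweet_count would feed the .metadata.json counters (assumed)


if __name__ == "__main__":
    # Minimal demo mirroring the launch code above.
    queue = mp.Queue()
    writer = mp.Process(target=filesystem_writer_sketch, args=(queue, "id,text"))
    writer.start()
    queue.put({"/tmp/example.csv": ("\n123,hello", 1)})
    queue.put("END")
    writer.join()

The hunk that follows shows two of the metadata helpers, apparently from that utilities module.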
import os
import json


def create_task_database_structure(
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Generate the following directory tree: a top dir that will contain
    all the tweet collections if it didn't exist yet and within it the top
    directory for this task with a new and empty metadata file
    """

    # Create the root directory for the tweet collection
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    collection_path = os.path.join(output_dir, db_name)
    if not os.path.isdir(collection_path):
        print("Initializing collection " + db_name + "...")
        os.mkdir(collection_path)
        generate_metadata_file(collection_path)
    return collection_path


def generate_metadata_file(collection_path) -> None:
    print("Executing generate_metadata_file")
    metadata_path = os.path.join(collection_path, ".metadata.json")
    file_metadata = {}  # type: Dict
    metadata = {}
    metadata["files"] = file_metadata
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
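For orientation, a hypothetical usage of the two helpers above, assuming they are importable from lib.utils as the main script's `from lib import utils` suggests; "pymongodump" mirrors the script's output directory name, while "some_db" is an invented database name.

import json
import os

from lib.utils import create_task_database_structure

# Creates ./pymongodump/ (if missing) and ./pymongodump/some_db/ with an
# empty metadata skeleton written by generate_metadata_file().
collection_path = create_task_database_structure("pymongodump", "some_db")

with open(os.path.join(collection_path, ".metadata.json")) as f:
    print(json.load(f))  # -> {'files': {}}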