Commit 36d7a65a by serpucga

Merge branch 'feature/parallelism' into develop

parents e093713a 52dbc5e9
pymongodump
tests.py
.mypy_cache
timing = False
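The bare `timing = False` line above is most plausibly the shared flag that the script below flips with `globals.timing = True`. A minimal sketch of that module, assuming it lives at `config/globals.py` (a path inferred from `from config import globals`, not shown in this diff):

# config/globals.py -- assumed location of the shared timing flag.
# The dump script sets this to True when invoked with -t/--timing.
timing = False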
@@ -3,69 +3,83 @@
import pymongo
import os
import argparse
import logging
import time
import multiprocessing as mp
from config import globals
from lib import utils
# Command line parsing
parser = argparse.ArgumentParser(
    description="Dump the tweets of a database to a JSON file")
parser.add_argument("-H", "--host", type=str, default="localhost")
parser.add_argument("-p", "--port", type=int, default=27017)
parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-t", "--timing", action="store_true")
parser.add_argument("database", type=str)
args = parser.parse_args()
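
# Example invocation (the script filename is an assumption; it is not shown
# in this hunk):
#   python pymongo_dump.py -H localhost -p 27017 -s 1000 -v -t <database>
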
def get_tweets_page(collection, page_size: int, num_page: int):
    tweets_page = []
    tweets = collection.find().skip(num_page * page_size).limit(page_size)
    for tweet in tweets:
        tweets_page.append(tweet)
    return tweets_page


if __name__ == "__main__":

    # Logging config
    logformat = "[%(asctime)s] %(message)s"
    dateformat = "%H:%M:%S"
    if args.verbose:
        logging.basicConfig(
            level=logging.DEBUG, format=logformat, datefmt=dateformat)
    else:
        logging.basicConfig(
            level=logging.ERROR, format=logformat, datefmt=dateformat)
    logger = logging.getLogger(__name__)

    # Initialize some variables
    script_dir = os.path.dirname(__file__)
    output_dir = os.path.join(script_dir, "pymongodump", args.database)
    header_file = os.path.join(script_dir, "config", "header.txt")
    with open(header_file) as f:
        header = f.readline()
    task_queue = mp.Queue()

    if args.timing:
        globals.timing = True
        time0 = time.time()

    # MongoDB connection to get page index
    client = pymongo.MongoClient(args.host, args.port)
    database_tweets = client[args.database]["tweets"]
    page_index = utils.get_page_index(database_tweets, args.pagesize)
    client.close()
    logger.debug(
        "Database {} partitioned in {} pages of {} tweets (maximum)"
        .format(args.database, len(page_index), args.pagesize))

    # Build a picklable function that we can pass to map
    def process_data_page(
            page, host=args.host, port=args.port, database=args.database,
            pagesize=args.pagesize, header=header, outputdir=output_dir,
            queue=task_queue):
        utils.process_page(
            host, port, database, header, outputdir, pagesize, page, queue)

    # Launch single process to write to the filesystem
    writer_worker = mp.Process(
        target=utils.filesystem_writer, args=(task_queue, header, ))
    writer_worker.start()

    # Launch pool of workers to perform the format conversion
    with mp.Pool() as pool:
        pool.map(process_data_page, page_index)
    task_queue.put("END")

    if globals.timing:
        time1 = time.time()
    utils.generate_metadata_file(output_dir)
    if globals.timing:
        logger.critical(
            "Time spent generating metadata file: {}s"
            .format(time.time() - time1))
        logger.critical(
            "Total execution time: {}s"
            .format(time.time() - time0))
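
The parallel flow above relies on three helpers from `lib.utils` that this diff does not include: `get_page_index`, `process_page` and `filesystem_writer`. The sketch below is a reconstruction of the producer/consumer pattern implied by their call sites, not the project's actual code; the function bodies are assumptions, and the reuse of `create_tweet_output_path` and `convert_tweet_to_csv` (real `lib.utils` helpers, called by the pre-merge sequential loop) is likewise assumed.

# Hypothetical sketch of the lib.utils paging/writing helpers, inferred only
# from how the main script calls them.
import math

import pymongo


def get_page_index(collection, page_size):
    # Assumed behaviour: one index entry per page of at most page_size tweets.
    num_tweets = collection.count_documents({})
    return list(range(math.ceil(num_tweets / page_size)))


def process_page(host, port, database, header, output_dir, page_size,
                 num_page, queue):
    # Worker side: fetch one page of tweets, convert each one to a CSV line,
    # group the lines by destination file and hand the buffers to the writer.
    client = pymongo.MongoClient(host, port)
    tweets = (client[database]["tweets"]
              .find().skip(num_page * page_size).limit(page_size))
    buffer_tweets = {}
    for tweet in tweets:
        # create_tweet_output_path / convert_tweet_to_csv exist in lib.utils
        # (the pre-merge loop used them); calling them here is an assumption
        # about how process_page is implemented.
        csv_path = create_tweet_output_path(header, tweet, output_dir)
        csv_line = "\n" + str(convert_tweet_to_csv(header, tweet))
        buffer_tweets[csv_path] = buffer_tweets.get(csv_path, "") + csv_line
    client.close()
    queue.put(buffer_tweets)


def filesystem_writer(queue, header):
    # Writer side: a single process drains the queue until the "END" sentinel
    # arrives, so no two workers ever append to the same CSV file at once.
    # `header` is presumably used to start newly created CSV files, and the
    # real implementation presumably also updates the .metadata.json counters.
    while True:
        task = queue.get()
        if task == "END":
            break
        for csv_path, contents in task.items():
            with open(csv_path, "a") as output_file:
                output_file.write(contents)

Serialising all disk writes through one process while the pool parallelises the CPU-bound CSV conversion appears to be the point of this merge: workers never contend for the same output file, and the "END" sentinel put on the queue after pool.map returns lets the writer shut down cleanly.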

# Second file in this commit (its path is not preserved in the scrape;
# presumably lib/utils.py, given `from lib import utils` above).
import os
import json
from typing import Dict


def create_task_database_structure(
        output_dir: str,
        db_name: str) -> str:
    """
    Generate the following directory tree: a top dir that will contain
    all the tweet collections if it didn't exist yet and within it the top
    directory for this task with a new and empty metadata file
    """

    # Create the root directory for the tweet collection
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    collection_path = os.path.join(output_dir, db_name)
    if not os.path.isdir(collection_path):
        print("Initializing collection " + db_name + "...")
        os.mkdir(collection_path)
        generate_metadata_file(collection_path)
    return collection_path
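
# Hypothetical usage (the paths are examples only, not from the source):
#   create_task_database_structure("pymongodump", "twitterdb")
# creates ./pymongodump/ if it is missing, then ./pymongodump/twitterdb/ with
# an empty .metadata.json, and returns "pymongodump/twitterdb".
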
def generate_metadata_file(collection_path) -> None:
    print("Executing generate_metadata_file")
    metadata_path = os.path.join(collection_path, ".metadata.json")
    file_metadata = {}  # type: Dict
    metadata = {}
    metadata["files"] = file_metadata
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
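
As written, generate_metadata_file only seeds an empty index, so a freshly initialised collection directory holds a .metadata.json containing just:

{"files": {}}

Per-file entries are presumably filled in later by other helpers (the project also references utils.increase_metadata_count), which is why the file is created empty here.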