Commit 888acbe2 by serpucga

Merge branch 'feature/fault_tolerance' into develop

parents 36d7a65a ab69fb73
pymongodump
recovery
tests.py
.mypy_cache
.recovery*
@@ -2,9 +2,11 @@
 import pymongo
 import os
+import sys
 import argparse
 import logging
 import time
+import json
 import multiprocessing as mp
 from config import globals
 from lib import utils
@@ -17,6 +19,7 @@ parser.add_argument("-p", "--port", type=int, default=27017)
 parser.add_argument("-s", "--pagesize", type=int, default=1000)
 parser.add_argument("-v", "--verbose", action="store_true")
 parser.add_argument("-t", "--timing", action="store_true")
+parser.add_argument("-r", "--recovery", type=str)
 parser.add_argument("database", type=str)
 args = parser.parse_args()
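
The new -r/--recovery flag takes the path of a state file left behind by an interrupted run. Assuming the entry point is the pymongodump script itself (the invocation is not shown in this diff, and the state file name is a guess based on the .recovery* ignore pattern), usage would look roughly like:

# Fresh conversion:
#     python pymongodump.py mydb
# Resumed conversion after a crash:
#     python pymongodump.py -r .recovery_mydb.json mydb
args = parser.parse_args(["-r", ".recovery_mydb.json", "mydb"])
assert args.recovery == ".recovery_mydb.json"
assert args.database == "mydb"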
@@ -44,13 +47,32 @@ if args.timing:
     time0 = time.time()

 # MongoDB connection to get page index
-client = pymongo.MongoClient(args.host, args.port)
-database_tweets = client[args.database]["tweets"]
-page_index = utils.get_page_index(database_tweets, args.pagesize)
-client.close()
-logger.debug(
-    "Database {} partitioned in {} pages of {} tweets (maximum)"
-    .format(args.database, len(page_index), args.pagesize))
+if args.recovery:
+    with open(args.recovery) as f:
+        recovery_data = json.load(f)
+    client = pymongo.MongoClient(
+        recovery_data["host"], recovery_data["port"])
+    database_tweets = client[recovery_data["database"]]["tweets"]
+    full_page_index = utils.get_page_index(
+        database_tweets, recovery_data["pagesize"])
+    client.close()
+    page_index = [page for page in full_page_index
+                  if page not in recovery_data["dumped_pages"]]
+    if "error_page" in recovery_data:
+        logger.debug("Discarding corrupted page")
+        page_index.remove(recovery_data.pop("error_page"))
+    logger.debug(
+        "Resuming collection conversion. {} of {} pages left."
+        .format(len(page_index), len(full_page_index)))
+else:
+    client = pymongo.MongoClient(args.host, args.port)
+    database_tweets = client[args.database]["tweets"]
+    page_index = utils.get_page_index(database_tweets, args.pagesize)
+    client.close()
+    logger.debug(
+        "Database {} partitioned in {} pages of {} tweets (maximum)"
+        .format(args.database, len(page_index), args.pagesize))

 # Build a picklable function that we can pass to map
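
The recovery branch above reads six fields from the JSON state file: host, port, database, pagesize, dumped_pages, and an optional error_page. The file itself is produced elsewhere (presumably by the filesystem writer), so this is only a sketch of its plausible shape, with invented values and an invented file name:

import json

# Hypothetical contents of a .recovery state file. Every key is one the
# code above reads; all values and the file name are made up.
recovery_state = {
    "host": "localhost",
    "port": 27017,
    "database": "mydb",
    "pagesize": 1000,
    "dumped_pages": [0, 1, 2],   # pages already written to disk
    "error_page": 3,             # present only if a page crashed the run
}
with open(".recovery_mydb.json", "w") as f:
    json.dump(recovery_state, f)

Because the code calls recovery_data.pop("error_page") and removes that page from page_index, a corrupted page is skipped on resume instead of being retried and crashing the run again.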
@@ -65,17 +87,31 @@ def process_data_page(

 # Launch single process to write to the filesystem
 writer_worker = mp.Process(
-    target=utils.filesystem_writer, args=(task_queue, header, ))
+    target=utils.filesystem_writer,
+    args=(task_queue, header, args.host, args.port,
+          args.database, args.pagesize, output_dir, args.recovery))
 writer_worker.start()

 # Launch pool of workers to perform the format conversion
-with mp.Pool() as pool:
-    pool.map(process_data_page, page_index)
-task_queue.put("END")
+try:
+    with mp.Pool() as pool:
+        pool.map(process_data_page, page_index)
+except utils.ExceptionAtPage as exc:
+    logger.error("Error detected at page {}".format(exc.error_page))
+    task_queue.put((exc.error_page, "ERROR"))
+    sys.exit(1)
+except (Exception, KeyboardInterrupt):
+    logger.error("Error detected")
+    task_queue.put((-2, "ERROR"))
+    sys.exit(1)
+task_queue.put((-1, "END"))

 if globals.timing:
     time1 = time.time()

 utils.generate_metadata_file(output_dir)
 logger.info("Metadata file created")

 if globals.timing:
     logger.critical(
         "Time spent generating metadata file: {}s"
@@ -83,3 +119,5 @@ if globals.timing:
     logger.critical(
         "Total execution time: {}s"
         .format(time.time() - time0))
+
+logger.info("Conversion completed successfully!!")