Commit 888acbe2 by serpucga

Merge branch 'feature/fault_tolerance' into develop

parents 36d7a65a ab69fb73
 pymongodump
+recovery
 tests.py
 .mypy_cache
+.recovery*
@@ -2,9 +2,11 @@
 import pymongo
 import os
+import sys
 import argparse
 import logging
 import time
+import json
 import multiprocessing as mp
 from config import globals
 from lib import utils
@@ -17,6 +19,7 @@ parser.add_argument("-p", "--port", type=int, default=27017)
 parser.add_argument("-s", "--pagesize", type=int, default=1000)
 parser.add_argument("-v", "--verbose", action="store_true")
 parser.add_argument("-t", "--timing", action="store_true")
+parser.add_argument("-r", "--recovery", type=str)
 parser.add_argument("database", type=str)
 args = parser.parse_args()
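The new -r/--recovery flag takes the path of a checkpoint file left behind by an interrupted run. A hypothetical invocation (the script and checkpoint file names are illustrative, not taken from this commit):

    python pymongodump.py --recovery .recovery_mydb.json mydb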
@@ -44,13 +47,32 @@ if args.timing:
     time0 = time.time()
 # MongoDB connection to get page index
-client = pymongo.MongoClient(args.host, args.port)
-database_tweets = client[args.database]["tweets"]
-page_index = utils.get_page_index(database_tweets, args.pagesize)
-client.close()
-logger.debug(
-    "Database {} partitioned in {} pages of {} tweets (maximum)"
-    .format(args.database, len(page_index), args.pagesize))
+if args.recovery:
+    with open(args.recovery) as f:
+        recovery_data = json.load(f)
+    client = pymongo.MongoClient(
+        recovery_data["host"], recovery_data["port"])
+    database_tweets = client[recovery_data["database"]]["tweets"]
+    full_page_index = utils.get_page_index(
+        database_tweets, recovery_data["pagesize"])
+    client.close()
+    page_index = [page for page in full_page_index
+                  if page not in recovery_data["dumped_pages"]]
+    if "error_page" in recovery_data:
+        logger.debug("Discarding corrupted page")
+        page_index.remove(recovery_data.pop("error_page"))
+    logger.debug(
+        "Resuming collection conversion. {} of {} pages left."
+        .format(len(page_index), len(full_page_index)))
+else:
+    client = pymongo.MongoClient(args.host, args.port)
+    database_tweets = client[args.database]["tweets"]
+    page_index = utils.get_page_index(database_tweets, args.pagesize)
+    client.close()
+    logger.debug(
+        "Database {} partitioned in {} pages of {} tweets (maximum)"
+        .format(args.database, len(page_index), args.pagesize))
 # Build a picklable function that we can pass to map
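The recovery branch above reads six keys from the checkpoint file, so whatever the writer leaves behind must be shaped roughly like the following. A minimal sketch in Python; only the keys are taken from the code, while the concrete values and the .recovery_mydb.json name are assumptions:

    import json

    # Hypothetical checkpoint contents; the keys match what the recovery
    # branch reads: host, port, database, pagesize, dumped_pages, and an
    # optional error_page when one page failed mid-conversion.
    recovery_data = {
        "host": "localhost",
        "port": 27017,
        "database": "mydb",
        "pagesize": 1000,
        "dumped_pages": [0, 1, 2, 4],  # pages already written to disk
        "error_page": 3,               # only present after a page error
    }
    with open(".recovery_mydb.json", "w") as f:
        json.dump(recovery_data, f)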
@@ -65,17 +87,31 @@ def process_data_page(
 # Launch single process to write to the filesystem
 writer_worker = mp.Process(
-    target=utils.filesystem_writer, args=(task_queue, header, ))
+    target=utils.filesystem_writer,
+    args=(task_queue, header, args.host, args.port,
+          args.database, args.pagesize, output_dir, args.recovery))
 writer_worker.start()
 # Launch pool of workers to perform the format conversion
-with mp.Pool() as pool:
-    pool.map(process_data_page, page_index)
-task_queue.put("END")
+try:
+    with mp.Pool() as pool:
+        pool.map(process_data_page, page_index)
+except utils.ExceptionAtPage as exc:
+    logger.error("Error detected at page {}".format(exc.error_page))
+    task_queue.put((exc.error_page, "ERROR"))
+    sys.exit(1)
+except (Exception, KeyboardInterrupt):
+    logger.error("Error detected")
+    task_queue.put((-2, "ERROR"))
+    sys.exit(1)
+task_queue.put((-1, "END"))
 if globals.timing:
     time1 = time.time()
 utils.generate_metadata_file(output_dir)
+logger.info("Metadata file created")
 if globals.timing:
     logger.critical(
         "Time spent generating metadata file: {}s"
@@ -83,3 +119,5 @@ if globals.timing:
     logger.critical(
         "Total execution time: {}s"
         .format(time.time() - time0))
+
+logger.info("Conversion completed successfully!!")