Commit 888acbe2 by serpucga

Merge branch 'feature/fault_tolerance' into develop

parents 36d7a65a ab69fb73
pymongodump
recovery
tests.py
.mypy_cache
.recovery*
@@ -3,6 +3,7 @@ import pymongo
import json
import re
import time
import datetime
import multiprocessing as mp
from math import ceil
from typing import List
@@ -15,7 +16,16 @@ import logging
logger = logging.getLogger(__name__)
def filesystem_writer(queue: mp.Queue, header: str) -> None:
def filesystem_writer(
queue: mp.Queue,
header: str,
host: str,
port: int,
database: str,
pagesize: int,
output_dir: str,
recovery_file: str)\
-> None:
"""
Reads the CSV pages from the queue and writes them to the filesystem
@@ -31,30 +41,47 @@ def filesystem_writer(queue: mp.Queue, header: str) -> None:
logger.debug(
"Worker {} launched: filesystem_writer executing"
.format(os.getpid()))
if recovery_file:
recovery_file_path = recovery_file
else:
recovery_file_path = build_recovery_filepath(database)
create_recovery_file(
recovery_file_path, host, port, database, pagesize)
while True:
csv_page = queue.get()
page_number, csv_page = queue.get()
if csv_page == "END":
logger.info("Writing loop finished")
os.remove(recovery_file_path)
break
if globals.timing and csv_page is not None:
time0 = time.time()
for output_path in csv_page.keys():
logger.debug("Dumping tweets for " + output_path)
if os.path.isfile(output_path):
with open(output_path, "a") as writer:
writer.write(csv_page[output_path])
elif csv_page == "ERROR":
logger.error("Dumping recovery file and exiting")
if page_number >= 0:
dump_error_recovery_file(recovery_file_path, page_number)
break
elif csv_page is not None:
if globals.timing:
time0 = time.time()
for output_path in csv_page.keys():
logger.debug("Dumping tweets for " + output_path)
if os.path.isfile(output_path):
with open(output_path, "a") as writer:
writer.write(csv_page[output_path])
else:
logger.debug("File {} not found, generating new..."
.format(output_path))
generate_path(output_path, header)
with open(output_path, "a") as writer:
writer.write(csv_page[output_path])
if page_number >= 0:
update_recovery_file(recovery_file_path, page_number)
if globals.timing:
logger.critical(
"Time spent writing tweet page to FS: {}s"
.format(time.time() - time0))
else:
logger.debug("File {} not found, generating new..."
.format(output_path))
generate_path(output_path, header)
with open(output_path, "a") as writer:
writer.write(csv_page[output_path])
if globals.timing and csv_page is not None:
time1 = time.time()
logger.critical(
"Time spent writing tweet page to FS: {}s"
.format(time1 - time0))
continue
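
For reference, every message that filesystem_writer pulls from the queue is a (page_number, payload) tuple; the sentinel page numbers come from the main script further below. A minimal sketch of that contract, with illustrative names that are not part of the commit:

# (n, {output_path: csv_rows})  -> page n converted; rows are appended and n is added
#                                  to "dumped_pages" via update_recovery_file
# (n, "ERROR") with n >= 0      -> page n failed; n is stored as "error_page"
# (-2, "ERROR")                 -> failure outside page processing; nothing is recorded
# (-1, "END")                   -> all pages written; the recovery file is deleted
END_MESSAGE = (-1, "END")
GENERIC_ERROR_MESSAGE = (-2, "ERROR")
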
def process_page(
@@ -80,35 +107,45 @@ def process_page(
:param queue: queue where processed data waits to be written to the FS
"""
logger.debug(
"Worker {} launched: process_page executing".format(os.getpid()))
if globals.timing:
time0 = time.time()
client = pymongo.MongoClient(host, port)
database_tweets = client[database]["tweets"]
tweets_page = get_tweets_page(database_tweets, pagesize, page_number)
buffer_tweets = {}
for tweet in tweets_page:
csv_tweet_output_path =\
get_tweet_output_path(tweet, output_dir)
csv_tweet_contents =\
"\n" + str(convert_tweet_to_csv(header, tweet))
if csv_tweet_output_path not in buffer_tweets.keys():
buffer_tweets[csv_tweet_output_path] = ""
buffer_tweets[csv_tweet_output_path] += csv_tweet_contents
client.close()
try:
logger.debug(
"Worker {} launched: process_page executing".format(os.getpid()))
if globals.timing:
time0 = time.time()
queue.put(buffer_tweets)
client = pymongo.MongoClient(host, port)
database_tweets = client[database]["tweets"]
tweets_page = get_tweets_page(database_tweets, pagesize, page_number)
buffer_tweets = {}
for tweet in tweets_page:
csv_tweet_output_path =\
get_tweet_output_path(tweet, output_dir)
try:
csv_tweet_contents =\
"\n" + str(convert_tweet_to_csv(header, tweet))
except TweetConversionException as exc:
logger.error(exc.message)
logger.error("Origin tweet:\n" + str(exc.tweet))
logger.error("Discarding tweet and proceeding...")
continue
if csv_tweet_output_path not in buffer_tweets.keys():
buffer_tweets[csv_tweet_output_path] = ""
buffer_tweets[csv_tweet_output_path] += csv_tweet_contents
client.close()
queue.put((page_number, buffer_tweets))
logger.debug("Page {} enqueued".format(page_number))
if globals.timing:
time1 = time.time()
logger.critical(
"Time processing & buffering tweet page: {}s"
.format(time1 - time0))
logger.debug("Page {} enqueued".format(page_number))
if globals.timing:
time1 = time.time()
logger.critical(
"Time processing & buffering tweet page: {}s"
.format(time1 - time0))
except Exception:
raise ExceptionAtPage(
"Something failed while processing page", page_number)
def get_tweet_output_path(tweet: dict, output_dir: str) -> str:
@@ -206,13 +243,109 @@ def convert_tweet_to_csv(header: str, tweet: dict) -> str:
fields in CSV form
"""
flat_tweet = json2csv.flatten_dictionary(tweet)
csv_tweet = json2csv.json2csv(flat_tweet, True, 5, False)
csv_appendable_tweet = format_csv.get_csv_line(header, csv_tweet)
try:
flat_tweet = json2csv.flatten_dictionary(tweet)
except Exception:
raise TweetConversionException(
"Error when flattening tweet", tweet)
try:
csv_tweet = json2csv.json2csv(flat_tweet, True, 5, False)
except Exception:
raise TweetConversionException(
"Error when trying to convert tweet to CSV", flat_tweet)
try:
csv_appendable_tweet = format_csv.get_csv_line(
header, csv_tweet)
except Exception:
raise TweetConversionException(
"Error when formatting CSV tweet", csv_tweet)
return csv_appendable_tweet
def build_recovery_filepath(dbname: str) -> str:
"""
Build the path of a recovery file
:param dbname: name of the database being queried
:returns: the path of the recovery file generated in this execution
"""
recovery_dir = "./recovery"
if not os.path.isdir(recovery_dir):
os.mkdir(recovery_dir)
now = datetime.datetime.now()
datetime_str = "%04d%02d%02d-%02d%02d%02d" %\
(now.year, now.month, now.day, now.hour, now.minute, now.second)
recovery_file_path = os.path.join(
recovery_dir,
"recovery_" + dbname + "_" + datetime_str + ".json")
return recovery_file_path
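
As an illustration of the naming scheme above, calling build_recovery_filepath("my_tweets") at 2019-06-12 15:30:45 would return a path like the following (database name and timestamp purely illustrative):

example_recovery_path = "./recovery/recovery_my_tweets_20190612-153045.json"
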
def create_recovery_file(
file_path: str,
host: str,
port: int,
database: str,
page_size: int)\
-> None:
"""
In case of error, dump information to a file to allow recovery
:param file_path: path where the recovery file will be created
:param host: address of the host to which the script connected
:param port: port of the Mongo database
:param database: name of the database being queried
:param page_size: size of the page that was being used
"""
recovery_file_contents = {}
recovery_file_contents["host"] = host
recovery_file_contents["port"] = port
recovery_file_contents["database"] = database
recovery_file_contents["pagesize"] = page_size
recovery_file_contents["dumped_pages"] = []
with open(file_path, "w") as f:
json.dump(recovery_file_contents, f)
logger.error("Generated recovery file at {}".format(file_path))
def update_recovery_file(
file_path: str,
page_number: int)\
-> None:
"""
Add a new page to the list of already dumped pages in the recovery
file
:param file_path: path to the recovery file
:param page_number: number of the page that was safely written
"""
with open(file_path, "r") as f:
recovery_file_contents = json.load(f)
recovery_file_contents["dumped_pages"].append(page_number)
with open(file_path, "w") as f:
json.dump(recovery_file_contents, f)
def dump_error_recovery_file(
file_path: str,
page_number: int)\
-> None:
"""
Add information pointing to the page where the error was detected
:param file_path: path to the recovery file
:param page_number: number of the page that crashed
"""
with open(file_path, "r") as f:
recovery_file_contents = json.load(f)
recovery_file_contents["error_page"] = page_number
with open(file_path, "w") as f:
json.dump(recovery_file_contents, f)
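
Putting the three helpers together, the recovery file on disk would look roughly like this (values illustrative; "error_page" only appears if dump_error_recovery_file was called):

example_recovery_contents = {
    "host": "localhost",
    "port": 27017,
    "database": "my_tweets",
    "pagesize": 1000,
    "dumped_pages": [0, 1, 5],  # pages safely written, added by update_recovery_file
    "error_page": 3,            # page that crashed, added by dump_error_recovery_file
}
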
#########################
# TWEET DB PAGINATION #
#########################
@@ -292,3 +425,38 @@ def file_length(file_path: str) -> int:
for i, l in enumerate(f):
pass
return i
#######################
# CUSTOM EXCEPTIONS #
#######################
class ExceptionAtPage(Exception):
"""
Exception designed to be raised when the conversion of a page of
tweets taken from Mongo fails
"""
def __init__(self, message: str, error_page: int):
"""
:param message: str descriptive of the error
:param error_page: int indicating the number of page that failed
"""
self.message = message
self.error_page = error_page
class TweetConversionException(Exception):
"""
Should be raised when a tweet causes an exception in the process of
being converted
"""
def __init__(self, message: str, tweet: str):
"""
:param message: str descriptive of the error
:param tweet: str with the contents of the tweet that caused the
failure
"""
self.message = message
self.tweet = tweet
@@ -2,9 +2,11 @@
import pymongo
import os
import sys
import argparse
import logging
import time
import json
import multiprocessing as mp
from config import globals
from lib import utils
@@ -17,6 +19,7 @@ parser.add_argument("-p", "--port", type=int, default=27017)
parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-t", "--timing", action="store_true")
parser.add_argument("-r", "--recovery", type=str)
parser.add_argument("database", type=str)
args = parser.parse_args()
@@ -44,13 +47,32 @@ if args.timing:
time0 = time.time()
# MongoDB connection to get page index
client = pymongo.MongoClient(args.host, args.port)
database_tweets = client[args.database]["tweets"]
page_index = utils.get_page_index(database_tweets, args.pagesize)
client.close()
logger.debug(
"Database {} partitioned in {} pages of {} tweets (maximum)"
.format(args.database, len(page_index), args.pagesize))
if args.recovery:
with open(args.recovery) as f:
recovery_data = json.load(f)
client = pymongo.MongoClient(
recovery_data["host"], recovery_data["port"])
database_tweets = client[recovery_data["database"]]["tweets"]
full_page_index = utils.get_page_index(
database_tweets, recovery_data["pagesize"])
client.close()
page_index = [page for page in full_page_index
if page not in recovery_data["dumped_pages"]]
if "error_page" in recovery_data:
logger.debug("Discarding corrupted page")
page_index.remove(recovery_data.pop("error_page"))
logger.debug(
"Resuming collection conversion. {} of {} pages left."
.format(len(page_index), len(full_page_index)))
else:
client = pymongo.MongoClient(args.host, args.port)
database_tweets = client[args.database]["tweets"]
page_index = utils.get_page_index(database_tweets, args.pagesize)
client.close()
logger.debug(
"Database {} partitioned in {} pages of {} tweets (maximum)"
.format(args.database, len(page_index), args.pagesize))
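
To make the resume path concrete, this is roughly what the filtering in the recovery branch computes for a small collection (a sketch with illustrative values):

full_page_index = [0, 1, 2, 3, 4, 5]
recovery_data = {"dumped_pages": [0, 1, 5], "error_page": 3}
page_index = [page for page in full_page_index
              if page not in recovery_data["dumped_pages"]]  # [2, 3, 4]
if "error_page" in recovery_data:
    # the corrupted page is discarded, not retried
    page_index.remove(recovery_data.pop("error_page"))       # page_index == [2, 4]
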
# Build a picklable function that we can pass to map
@@ -65,17 +87,31 @@ def process_data_page(
# Launch single process to write to the filesystem
writer_worker = mp.Process(
target=utils.filesystem_writer, args=(task_queue, header, ))
target=utils.filesystem_writer,
args=(task_queue, header, args.host, args.port,
args.database, args.pagesize, output_dir, args.recovery))
writer_worker.start()
# Launch pool of workers to perform the format conversion
with mp.Pool() as pool:
pool.map(process_data_page, page_index)
task_queue.put("END")
try:
with mp.Pool() as pool:
pool.map(process_data_page, page_index)
except utils.ExceptionAtPage as exc:
logger.error("Error detected at page {}".format(exc.error_page))
task_queue.put((exc.error_page, "ERROR"))
sys.exit(1)
except (Exception, KeyboardInterrupt):
logger.error("Error detected")
task_queue.put((-2, "ERROR"))
sys.exit(1)
task_queue.put((-1, "END"))
if globals.timing:
time1 = time.time()
utils.generate_metadata_file(output_dir)
logger.info("Metadata file created")
if globals.timing:
logger.critical(
"Time spent generating metadata file: {}s"
@@ -83,3 +119,5 @@ if globals.timing:
logger.critical(
"Total execution time: {}s"
.format(time.time() - time0))
logger.info("Conversion completed successfully!!")