Commit a4b15d31 by serpucga

First definitive version. May contain bugs

parents d1923e7e 8d7eb4a2
pymongodump
recovery
tests.py
.mypy_cache
.recovery*
timing = False
id,text,created_at,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,quoted_status_id,is_quote_status,retweet_count,favorite_count,user.id,user.name,user.created_at,user.screen_name,user.location,user.profile_image_url,user.verified,user.followers_count,user.friends_count,user.listed_count,user.favourites_count,user.statuses_count,user.geo_enabled,user.lang,entities.hashtags.text,entities.urls.expanded_url,entities.user_mentions.screen_name,entities.media.media_url,place.id,place.name,place.full_name,place.country,place.country_code,place.place_type,place.url,place.bounding_box.type,place.bounding_box.coordinates,coordinates.type,coordinates.coordinates
#!/usr/bin/env python
import pymongo
import os
import sys
import argparse
import logging
import time
import json
import multiprocessing as mp
from config import globals
from lib import utils
# Command line parsing
parser = argparse.ArgumentParser(
    description="Dump the tweets of a database to CSV files")
parser.add_argument("-H", "--host", type=str, default="localhost")
parser.add_argument("-p", "--port", type=int, default=27017)
parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-t", "--timing", action="store_true")
parser.add_argument("-r", "--recovery", type=str)
parser.add_argument("database", type=str)
args = parser.parse_args()
# Logging config
logformat = "[%(asctime)s] %(message)s"
dateformat = "%H:%M:%S"
if args.verbose:
    logging.basicConfig(
        level=logging.DEBUG, format=logformat, datefmt=dateformat)
else:
    logging.basicConfig(
        level=logging.ERROR, format=logformat, datefmt=dateformat)
logger = logging.getLogger(__name__)
# Initialize some variables
script_dir = os.path.dirname(__file__)
output_dir = os.path.join(script_dir, "pymongodump", args.database)
header_file = os.path.join(script_dir, "config", "header.txt")
with open(header_file) as f:
    header = f.readline()
buffer_tweets = {}
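# Queue feeding the single filesystem-writer process. Besides regular page
# payloads it carries the control markers used further down: (page, "ERROR")
# when a worker fails and (-1, "END") when all pages have been queued.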
task_queue = mp.Queue()
if args.timing:
    globals.timing = True
    time0 = time.time()
# MongoDB connection to get page index
logger.debug("The indexing of the collection may take a while if "
+ "the collection is too big. Please, be patient...")
if args.recovery:
    with open(args.recovery) as f:
        recovery_data = json.load(f)
    client = pymongo.MongoClient(
        recovery_data["host"], recovery_data["port"])
    database_tweets = client[recovery_data["database"]]["tweets"]
    page_index = utils.get_page_index_fast(
        database_tweets, recovery_data["pagesize"])
    client.close()
    full_page_index_len = len(page_index)
    for page in recovery_data["dumped_pages"]:
        page_index.pop(page, None)
    if "error_page" in recovery_data:
        logger.debug("Discarding corrupted page")
        page_index.pop(recovery_data.pop("error_page"))
    logger.debug(
        "Resuming collection conversion. {} of {} pages left."
        .format(len(page_index), full_page_index_len))
else:
    client = pymongo.MongoClient(args.host, args.port)
    database_tweets = client[args.database]["tweets"]
    page_index = utils.get_page_index_fast(database_tweets, args.pagesize)
    client.close()
    logger.debug(
        "Database {} partitioned in {} pages of {} tweets (maximum)"
        .format(args.database, len(page_index), args.pagesize))
# Build a picklable function that we can pass to map
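# Binding everything except "page" through default argument values keeps the
# function picklable at module level, so Pool.map only has to send each page
# number to the worker processes.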
def process_data_page(
        page, host=args.host, port=args.port, database=args.database,
        pagesize=args.pagesize, header=header, output_dir=output_dir,
        queue=task_queue, page_index=page_index):
    utils.process_page(
        host, port, database, header, output_dir, pagesize, page,
        page_index, queue)
# Launch single process to write to the filesystem
try:
    writer_worker = mp.Process(
        target=utils.filesystem_writer,
        args=(task_queue, header, args.host, args.port,
              args.database, args.pagesize, output_dir, args.recovery))
    writer_worker.start()
except Exception:
    logger.error("There was a failure in the filesystem writer", exc_info=True)
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
# Launch pool of workers to perform the format conversion
try:
    with mp.Pool() as pool:
        pool.map(process_data_page, page_index.keys())
except utils.ExceptionAtPage as exc:
    logger.error("Error detected at page {}".format(exc.error_page))
    task_queue.put((exc.error_page, "ERROR"))
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
except (Exception, KeyboardInterrupt):
    logger.error("Error detected", exc_info=True)
    task_queue.put((-2, "ERROR"))
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
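# All pages were converted and queued successfully: tell the writer to finish.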
task_queue.put((-1, "END"))
if globals.timing:
    time1 = time.time()
try:
    utils.generate_metadata_file(output_dir)
    logger.info("Metadata file created")
except (Exception, KeyboardInterrupt):
    logger.error("The collection was converted correctly to CSV, but something"
                 + " failed when generating the metadata file", exc_info=True)
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
if globals.timing:
    logger.critical(
        "Time spent generating metadata file: {}s"
        .format(time.time() - time1))
    logger.critical(
        "Total execution time: {}s"
        .format(time.time() - time0))
os.system('spd-say "Conversion completed successfully!"')
logger.info("Conversion completed successfully!!")
@@ -3,6 +3,8 @@
 import pymongo
 import os
 import argparse
+import pprint
+from tweet_manager.lib.json2csv import flatten

 parser = argparse.ArgumentParser(
     description="Dump the tweets of a database to a JSON file")
...