Commit a4b15d31 by serpucga

First definitive version. May contain bugs

parents d1923e7e 8d7eb4a2
pymongodump
recovery
tests.py
.mypy_cache
.recovery*
timing = False
id,text,created_at,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,quoted_status_id,is_quote_status,retweet_count,favorite_count,user.id,user.name,user.created_at,user.screen_name,user.location,user.profile_image_url,user.verified,user.followers_count,user.friends_count,user.listed_count,user.favourites_count,user.statuses_count,user.geo_enabled,user.lang,entities.hashtags.text,entities.urls.expanded_url,entities.user_mentions.screen_name,entities.media.media_url,place.id,place.name,place.full_name,place.country,place.country_code,place.place_type,place.url,place.bounding_box.type,place.bounding_box.coordinates,coordinates.type,coordinates.coordinates
#!/usr/bin/env python
import pymongo
import os
import sys
import argparse
import logging
import time
import json
import multiprocessing as mp
from config import globals
from lib import utils
# Command line parsing
parser = argparse.ArgumentParser(
    description="Dump the tweets of a database to CSV files")
parser.add_argument("-H", "--host", type=str, default="localhost")
parser.add_argument("-p", "--port", type=int, default=27017)
parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-t", "--timing", action="store_true")
parser.add_argument("-r", "--recovery", type=str)
parser.add_argument("database", type=str)
args = parser.parse_args()
# Logging config
logformat = "[%(asctime)s] %(message)s"
dateformat = "%H:%M:%S"
if args.verbose:
    logging.basicConfig(
        level=logging.DEBUG, format=logformat, datefmt=dateformat)
else:
    logging.basicConfig(
        level=logging.ERROR, format=logformat, datefmt=dateformat)
logger = logging.getLogger(__name__)
# Initialize some variables
script_dir = os.path.dirname(__file__)
output_dir = os.path.join(script_dir, "pymongodump", args.database)
header_file = os.path.join(script_dir, "config", "header.txt")
with open(header_file) as f:
    header = f.readline()
buffer_tweets = {}
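# Queue feeding the single filesystem-writer process. Besides regular page
# payloads it carries the control markers used further down: (page, "ERROR")
# when a worker fails and (-1, "END") when all pages have been queued.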
task_queue = mp.Queue()
if args.timing:
    globals.timing = True
    time0 = time.time()
# MongoDB connection to get page index
logger.debug("The indexing of the collection may take a while if "
+ "the collection is too big. Please, be patient...")
if args.recovery:
    with open(args.recovery) as f:
        recovery_data = json.load(f)
    client = pymongo.MongoClient(
        recovery_data["host"], recovery_data["port"])
    database_tweets = client[recovery_data["database"]]["tweets"]
    page_index = utils.get_page_index_fast(
        database_tweets, recovery_data["pagesize"])
    client.close()
    full_page_index_len = len(page_index)
    for page in recovery_data["dumped_pages"]:
        page_index.pop(page, None)
    if "error_page" in recovery_data:
        logger.debug("Discarding corrupted page")
        page_index.pop(recovery_data.pop("error_page"))
    logger.debug(
        "Resuming collection conversion. {} of {} pages left."
        .format(len(page_index), full_page_index_len))
else:
    client = pymongo.MongoClient(args.host, args.port)
    database_tweets = client[args.database]["tweets"]
    page_index = utils.get_page_index_fast(database_tweets, args.pagesize)
    client.close()
    logger.debug(
        "Database {} partitioned in {} pages of {} tweets (maximum)"
        .format(args.database, len(page_index), args.pagesize))
# Build a picklable function that we can pass to map
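# Binding everything except "page" through default argument values keeps the
# function picklable at module level, so Pool.map only has to send each page
# number to the worker processes.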
def process_data_page(
        page, host=args.host, port=args.port, database=args.database,
        pagesize=args.pagesize, header=header, output_dir=output_dir,
        queue=task_queue, page_index=page_index):
    utils.process_page(
        host, port, database, header, output_dir, pagesize, page,
        page_index, queue)
# Launch single process to write to the filesystem
try:
    writer_worker = mp.Process(
        target=utils.filesystem_writer,
        args=(task_queue, header, args.host, args.port,
              args.database, args.pagesize, output_dir, args.recovery))
    writer_worker.start()
except Exception:
    logger.error("There was a failure in the filesystem writer", exc_info=True)
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
# Launch pool of workers to perform the format conversion
try:
    with mp.Pool() as pool:
        pool.map(process_data_page, page_index.keys())
except utils.ExceptionAtPage as exc:
    logger.error("Error detected at page {}".format(exc.error_page))
    task_queue.put((exc.error_page, "ERROR"))
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
except (Exception, KeyboardInterrupt):
    logger.error("Error detected", exc_info=True)
    task_queue.put((-2, "ERROR"))
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
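# All pages were converted and queued successfully: tell the writer to finish.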
task_queue.put((-1, "END"))
if globals.timing:
    time1 = time.time()
try:
    utils.generate_metadata_file(output_dir)
    logger.info("Metadata file created")
except (Exception, KeyboardInterrupt):
    logger.error("The collection was converted correctly to CSV, but something"
                 + " failed when generating the metadata file", exc_info=True)
    os.system("spd-say 'Something really bad happened!'")
    sys.exit(1)
if globals.timing:
    logger.critical(
        "Time spent generating metadata file: {}s"
        .format(time.time() - time1))
    logger.critical(
        "Total execution time: {}s"
        .format(time.time() - time0))
os.system('spd-say "Conversion completed successfully!"')
logger.info("Conversion completed successfully!!")
@@ -3,6 +3,8 @@
 import pymongo
 import os
 import argparse
+import pprint
+from tweet_manager.lib.json2csv import flatten

 parser = argparse.ArgumentParser(
     description="Dump the tweets of a database to a JSON file")
...