Commit 2e50b803 by serpucga

Changed way of generating metadata file

Previously this was done the same way as in UTool: each time X tweets were appended to a CSV, the corresponding entry in the metadata file was increased by X. However, for a script that converts static Mongo collections (ones that are no longer growing) to CSV, it is simpler to count the lines of each CSV file once the conversion process has ended. This also eliminates the risk of the metadata being corrupted by badly synchronized parallel writes
parent 1f695bf1
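For context, the two strategies the message contrasts can be sketched as follows. This is an illustrative sketch only; the function names racy_increment and count_once_at_end are hypothetical, not part of the module:

import json
import os


def racy_increment(metadata_path: str, file_path: str, increase: int) -> None:
    # Old strategy: read-modify-write on a shared JSON file. If two
    # worker processes interleave between load and dump, one update
    # silently overwrites the other and the counts drift.
    with open(metadata_path, "r+") as f:
        metadata = json.load(f)
        metadata["files"][file_path]["count"] += increase
        f.seek(0)
        f.truncate()
        json.dump(metadata, f)


def count_once_at_end(collection_path: str) -> dict:
    # New strategy: after every worker has finished, walk the finished
    # collection and count each file's tweets (lines minus the CSV
    # header). No shared mutable state, so nothing can race.
    counts = {}
    for root, _, files in os.walk(collection_path):
        for name in files:
            path = os.path.join(root, name)
            with open(path) as f:
                tweets = max(sum(1 for _ in f) - 1, 0)
            counts[os.path.relpath(path, collection_path)] = {"count": tweets}
    return counts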
import os
import json
import re
import pymongo
from math import ceil
from tweet_manager.lib import json2csv, format_csv
@@ -12,7 +12,7 @@ def create_task_database_structure(
"""
Generate the following directory tree: a top dir that will contain
all the tweet collections if it didn't exist yet and within it the top
directory for this task with a new and empty metadata file
directory for this task
"""
# Create the root directory for the tweet collection
@@ -27,68 +27,10 @@
if not os.path.isdir(collection_path):
print("Initializing collection " + db_name + "...")
os.mkdir(collection_path)
metadata_path = os.path.join(collection_path, ".metadata.json")
generate_metadata_file(metadata_path)
return collection_path
def generate_metadata_file(metadata_path) -> None:
print("Executing generate_metadata_file")
file_metadata = {}
metadata = {}
metadata["files"] = file_metadata
with open(metadata_path, "w") as f:
json.dump(metadata, f)
def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
"""
Add a new dictionary structure to the metadata file that contains
information about a newly added CSV. This should only be used for files
that have just been added to the collection, because it initializes the
count to 0
"""
print("Executing add_newfile_to_metadata")
try:
with open(metadata_path, "r+") as f:
metadata_file = json.load(f)
metadata_file["files"][file_path] = {}
metadata_file["files"][file_path]["count"] = 0
f.seek(0)
f.truncate()
json.dump(metadata_file, f)
except IOError:
generate_metadata_file(metadata_path)
add_newfile_to_metadata(file_path, metadata_path)
def increase_metadata_count(
metadata_path: str,
file_path: str,
increase: int = 1)\
-> None:
"""
Use this when one tweet is appended to one of the CSVs in the
collection. This function will update the metadata file by increasing
by x the corresponding dictionary structure
"""
print("Executing increase_metadata_count")
try:
with open(metadata_path, "r+") as f:
metadata_file = json.load(f)
metadata_file["files"][file_path]["count"] += increase
f.seek(0)
f.truncate()
json.dump(metadata_file, f)
except IOError:
generate_metadata_file(metadata_path)
increase_metadata_count(metadata_path, file_path, increase)
def create_tweet_output_path(
header: str,
tweet: dict,
@@ -116,9 +58,6 @@ def create_tweet_output_path(
if os.path.isfile(tweet_output_file) is False:
with open(tweet_output_file, "w") as fw:
fw.write(header.strip())
add_newfile_to_metadata(
tweet_output_file,
os.path.join(collection_path, ".metadata.json"))
return tweet_output_file
@@ -166,13 +105,42 @@ def write_tweets_to_files(host, port, database, pagesize, header: str,
# Update the buffer adding the tweet and increasing tweet count
buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
buffer_tweets[csv_tweet_output_path][1] += 1
# buffer_tweets[csv_tweet_output_path][1] += 1
client.close()
# Perform the write operations in each of the files
for output_path in buffer_tweets.keys():
with open(output_path, "a") as tweet_writer:
tweet_writer.write(buffer_tweets[output_path][0])
increase_metadata_count(
os.path.join(output_dir, ".metadata.json"),
output_path, increase=buffer_tweets[output_path][1])
def file_length(file_path: str) -> int:
    """
    Count the tweet rows of a CSV file. enumerate() is zero-based, so
    the final index equals the number of lines minus one, which leaves
    out the CSV header line. Returns 0 for an empty file
    """
    i = 0
    with open(file_path) as f:
        for i, _ in enumerate(f):
            pass
    return i
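A quick sanity check of the helper (the file name here is hypothetical): a CSV holding a header line plus three tweet rows should report a count of 3.

# Hypothetical smoke test for file_length: header + 3 tweets -> 3
with open("sample.csv", "w") as f:
    f.write("id,text\n1,hello\n2,hi\n3,hey\n")
assert file_length("sample.csv") == 3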
def generate_metadata_file(collection_path: str) -> None:
"""
Once all the CSV files have been created, generate a metadata file
with information about the number of tweets in each of the CSVs by
making a simple count of lines
"""
metadata = {}
metadata["files"] = {}
for root, dirs, files in os.walk(collection_path):
for f in files:
file_path = os.path.join(root, f)
relative_path = os.path.relpath(file_path, collection_path)
metadata["files"][relative_path] = {}
metadata["files"][relative_path]["count"] =\
file_length(file_path)
output_path = os.path.join(collection_path, ".metadata.json")
with open(output_path, 'w') as f:
json.dump(metadata, f)
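The resulting .metadata.json maps each CSV's path, relative to the collection root, to its tweet count. Its shape is roughly as follows (the paths and counts below are made up):

{
    "files": {
        "2019/06/01.csv": {"count": 1432},
        "2019/06/02.csv": {"count": 987}
    }
}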
@@ -44,3 +44,5 @@ def write_page(
# Make the computation
with mp.Pool() as pool:
pool.map(write_page, page_index)
utils.generate_metadata_file(output_dir)
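This ordering is what makes the new scheme safe: pool.map blocks until every page has been written, so all CSVs are complete by the time generate_metadata_file counts their lines. A minimal sketch of the pattern (worker body elided; the names mirror the diff but the setup is hypothetical):

import multiprocessing as mp


def write_page(page: int) -> None:
    pass  # each worker appends its page of tweets to the CSVs


if __name__ == "__main__":
    page_index = range(10)
    with mp.Pool() as pool:               # workers write in parallel
        pool.map(write_page, page_index)  # blocks until every page is done
    # Only now is it safe to count lines: no file is still being written
    # utils.generate_metadata_file(output_dir)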
import os
import json
from typing import Dict
def create_task_database_structure(
output_dir: str,
db_name: str)\
-> str:
"""
Generate the following directory tree: a top dir that will contain
all the tweet collections if it didn't exist yet and within it the top
directory for this task with a new and empty metadata file
"""
# Create the root directory for the tweet collection
if not os.path.isdir(output_dir):
print(
"Building directory to contain the collected tweets at: "
+ os.path.abspath(output_dir)
)
os.mkdir(output_dir)
collection_path = os.path.join(output_dir, db_name)
if not os.path.isdir(collection_path):
print("Initializing collection " + db_name + "...")
os.mkdir(collection_path)
generate_metadata_file(collection_path)
return collection_path
def generate_metadata_file(collection_path) -> None:
print("Executing generate_metadata_file")
metadata_path = os.path.join(collection_path, ".metadata.json")
file_metadata = {} # type: Dict
metadata = {}
metadata["files"] = file_metadata
with open(metadata_path, "w") as f:
json.dump(metadata, f)
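For reference, the file this function writes starts out as nothing more than an empty files map:

{"files": {}}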