Commit 2e50b803 by serpucga

Changed way of generating metadata file

Previously this was done the same way as in UTool: each time X tweets were appended to a CSV, the corresponding entry in the metadata file was increased by X. However, for a script that converts static Mongo collections (ones that are no longer growing) to CSV, it is simpler to count the lines of each CSV file once the conversion process has ended. This also eliminates the risk of the metadata being corrupted by badly synchronized parallel writes
parent 1f695bf1
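For context, the two strategies the message contrasts can be sketched as follows. This is an illustrative sketch only; the function names racy_increment and count_once_at_end are hypothetical, not part of the module:

import json
import os


def racy_increment(metadata_path: str, file_path: str, increase: int) -> None:
    # Old strategy: read-modify-write on a shared JSON file. If two
    # worker processes interleave between load and dump, one update
    # silently overwrites the other and the counts drift.
    with open(metadata_path, "r+") as f:
        metadata = json.load(f)
        metadata["files"][file_path]["count"] += increase
        f.seek(0)
        f.truncate()
        json.dump(metadata, f)


def count_once_at_end(collection_path: str) -> dict:
    # New strategy: after every worker has finished, walk the finished
    # collection and count each file's tweets (lines minus the CSV
    # header). No shared mutable state, so nothing can race.
    counts = {}
    for root, _, files in os.walk(collection_path):
        for name in files:
            path = os.path.join(root, name)
            with open(path) as f:
                tweets = max(sum(1 for _ in f) - 1, 0)
            counts[os.path.relpath(path, collection_path)] = {"count": tweets}
    return counts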
import os
import json
import re
import pymongo
from math import ceil
from tweet_manager.lib import json2csv, format_csv
@@ -12,7 +12,7 @@ def create_task_database_structure(
"""
Generate the following directory tree: a top dir that will contain
all the tweet collections if it didn't exist yet and within it the top
directory for this task with a new and empty metadata file
directory for this task
"""
# Create the root directory for the tweet collection
@@ -27,68 +27,10 @@
if not os.path.isdir(collection_path):
print("Initializing collection " + db_name + "...")
os.mkdir(collection_path)
metadata_path = os.path.join(collection_path, ".metadata.json")
generate_metadata_file(metadata_path)
return collection_path
def generate_metadata_file(metadata_path) -> None:
print("Executing generate_metadata_file")
file_metadata = {}
metadata = {}
metadata["files"] = file_metadata
with open(metadata_path, "w") as f:
json.dump(metadata, f)
def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
"""
Add a new dictionary structure to the metadata file that contains
information about a newly added CSV. This should only be used for files
that have just been added to the collection, because it initializes the
count to 0
"""
print("Executing add_newfile_to_metadata")
try:
with open(metadata_path, "r+") as f:
metadata_file = json.load(f)
metadata_file["files"][file_path] = {}
metadata_file["files"][file_path]["count"] = 0
f.seek(0)
f.truncate()
json.dump(metadata_file, f)
except IOError:
generate_metadata_file(metadata_path)
add_newfile_to_metadata(file_path, metadata_path)
def increase_metadata_count(
metadata_path: str,
file_path: str,
increase: int = 1)\
-> None:
"""
Use this when one tweet is appended to one of the CSVs in the
collection. This function will update the metadata file by increasing
by x the corresponding dictionary structure
"""
print("Executing increase_metadata_count")
try:
with open(metadata_path, "r+") as f:
metadata_file = json.load(f)
metadata_file["files"][file_path]["count"] += increase
f.seek(0)
f.truncate()
json.dump(metadata_file, f)
except IOError:
generate_metadata_file(metadata_path)
increase_metadata_count(metadata_path, file_path, increase)
def create_tweet_output_path(
header: str,
tweet: dict,
@@ -116,9 +58,6 @@ def create_tweet_output_path(
if os.path.isfile(tweet_output_file) is False:
with open(tweet_output_file, "w") as fw:
fw.write(header.strip())
add_newfile_to_metadata(
tweet_output_file,
os.path.join(collection_path, ".metadata.json"))
return tweet_output_file
@@ -166,13 +105,42 @@ def write_tweets_to_files(host, port, database, pagesize, header: str,
# Update the buffer adding the tweet and increasing tweet count
buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
buffer_tweets[csv_tweet_output_path][1] += 1
# buffer_tweets[csv_tweet_output_path][1] += 1
client.close()
# Perform the write operations in each of the files
for output_path in buffer_tweets.keys():
with open(output_path, "a") as tweet_writer:
tweet_writer.write(buffer_tweets[output_path][0])
increase_metadata_count(
os.path.join(output_dir, ".metadata.json"),
output_path, increase=buffer_tweets[output_path][1])
def file_length(file_path: str) -> int:
    """
    Count the tweet rows of a CSV file. enumerate() is zero-based, so
    the final index equals the number of lines minus one, which leaves
    out the CSV header line. Returns 0 for an empty file
    """
    i = 0
    with open(file_path) as f:
        for i, _ in enumerate(f):
            pass
    return i
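A quick sanity check of the helper (the file name here is hypothetical): a CSV holding a header line plus three tweet rows should report a count of 3.

# Hypothetical smoke test for file_length: header + 3 tweets -> 3
with open("sample.csv", "w") as f:
    f.write("id,text\n1,hello\n2,hi\n3,hey\n")
assert file_length("sample.csv") == 3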
def generate_metadata_file(collection_path: str) -> None:
"""
Once all the CSV files have been created, generate a metadata file
with information about the number of tweets in each of the CSVs by
making a simple count of lines
"""
metadata = {}
metadata["files"] = {}
for root, dirs, files in os.walk(collection_path):
for f in files:
file_path = os.path.join(root, f)
relative_path = os.path.relpath(file_path, collection_path)
metadata["files"][relative_path] = {}
metadata["files"][relative_path]["count"] =\
file_length(file_path)
output_path = os.path.join(collection_path, ".metadata.json")
with open(output_path, 'w') as f:
json.dump(metadata, f)
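The resulting .metadata.json maps each CSV's path, relative to the collection root, to its tweet count. Its shape is roughly as follows (the paths and counts below are made up):

{
    "files": {
        "2019/06/01.csv": {"count": 1432},
        "2019/06/02.csv": {"count": 987}
    }
}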
@@ -44,3 +44,5 @@ def write_page(
# Make the computation
with mp.Pool() as pool:
pool.map(write_page, page_index)
utils.generate_metadata_file(output_dir)
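This ordering is what makes the new scheme safe: pool.map blocks until every page has been written, so all CSVs are complete by the time generate_metadata_file counts their lines. A minimal sketch of the pattern (worker body elided; the names mirror the diff but the setup is hypothetical):

import multiprocessing as mp


def write_page(page: int) -> None:
    pass  # each worker appends its page of tweets to the CSVs


if __name__ == "__main__":
    page_index = range(10)
    with mp.Pool() as pool:               # workers write in parallel
        pool.map(write_page, page_index)  # blocks until every page is done
    # Only now is it safe to count lines: no file is still being written
    # utils.generate_metadata_file(output_dir)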
import os
import json
from typing import Dict
def create_task_database_structure(
output_dir: str,
db_name: str)\
-> str:
"""
Generate the following directory tree: a top dir that will contain
all the tweet collections if it didn't exist yet and within it the top
directory for this task with a new and empty metadata file
"""
# Create the root directory for the tweet collection
if not os.path.isdir(output_dir):
print(
"Building directory to contain the collected tweets at: "
+ os.path.abspath(output_dir)
)
os.mkdir(output_dir)
collection_path = os.path.join(output_dir, db_name)
if not os.path.isdir(collection_path):
print("Initializing collection " + db_name + "...")
os.mkdir(collection_path)
generate_metadata_file(collection_path)
return collection_path
def generate_metadata_file(collection_path) -> None:
print("Executing generate_metadata_file")
metadata_path = os.path.join(collection_path, ".metadata.json")
file_metadata = {} # type: Dict
metadata = {}
metadata["files"] = file_metadata
with open(metadata_path, "w") as f:
json.dump(metadata, f)
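For reference, the file this function writes starts out as nothing more than an empty files map:

{"files": {}}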