Commit 95eb843f by serpucga

Script working (seemingly fine)

parent dcc81cb0
@@ -5,12 +5,17 @@ import os
 import argparse
 import json
 import re
+import datetime
+from email.utils import parsedate
 from tweet_manager.lib import json2csv, format_csv
+def parse_datetime(string):
+    return datetime.datetime(*(parsedate(string)[:6]))
 def create_task_database_structure(
-        output_dir: str,
-        db_name: str)\
+        output_dir: str)\
         -> str:
     """
     Generate the following directory tree: a top dir that will contain
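Note: `email.utils.parsedate` returns a 9-field date tuple (or `None` if the string cannot be parsed), and slicing its first six fields gives `datetime.datetime` the year, month, day, hour, minute and second. A minimal usage sketch, assuming RFC-2822-style date strings (the sample date is illustrative):

```python
import datetime
from email.utils import parsedate

# parsedate -> (year, month, day, hour, minute, second, weekday, yearday, dst)
fields = parsedate("Tue, 19 Mar 2019 14:05:10 +0000")
print(datetime.datetime(*fields[:6]))
# 2019-03-19 14:05:10
```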
@@ -19,6 +24,7 @@ def create_task_database_structure(
     """
     # Create the root directory for the tweet collection
+    (output_dir, db_name) = os.path.split(output_dir)
     if not os.path.isdir(output_dir):
         print(
             "Building directory to contain the collected tweets at: "
@@ -67,12 +73,35 @@ def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
         add_newfile_to_metadata(file_path, metadata_path)
+def increase_metadata_count(
+        metadata_path: str,
+        file_path: str,
+        increase: int = 1)\
+        -> None:
+    """
+    Use this when one tweet is appended to one of the CSVs in the
+    collection. This function will update the metadata file by increasing
+    by x the corresponding dictionary structure
+    """
+    print("Executing increase_metadata_count")
+    try:
+        with open(metadata_path, "r+") as f:
+            metadata_file = json.load(f)
+            metadata_file["files"][file_path]["count"] += increase
+            f.seek(0)
+            f.truncate()
+            json.dump(metadata_file, f)
+    except IOError:
+        generate_metadata_file(metadata_path)
+        increase_metadata_count(metadata_path, file_path, increase)
 def create_tweet_output_path(
         tweet: dict,
-        output_dir: str,
-        db_name: str)\
+        output_dir: str)\
         -> str:
-    collection_path = create_task_database_structure(output_dir, db_name)
+    collection_path = create_task_database_structure(output_dir)
     # Extract year, month and date from the tweet using a regex
     matchObj = re.search(
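Note: judging from the lookup `metadata_file["files"][file_path]["count"]`, the `.metadata.json` file presumably holds per-CSV tweet counts shaped roughly like the sketch below (the exact layout beyond `files` and `count` is an assumption). If the file is missing, the `except IOError` branch regenerates it via `generate_metadata_file` (defined elsewhere in this script) and retries; a file that exists but lacks the `file_path` entry would still raise a `KeyError`, which this handler does not catch.

```python
# Assumed shape of .metadata.json (path and count are illustrative):
metadata = {
    "files": {
        "pymongodump/my_database/2019/03/19.csv": {"count": 42},
    },
}
```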
@@ -99,23 +128,20 @@ def create_tweet_output_path(
     return tweet_output_file
-#
-# def convert_tweet_to_csv(tweet: dict) -> str:
-#     # Flatten the tweet and store it in status_flat
-#     status_flat = json2csv.flatten_dictionary(tweet)
-#
-#     # Convert the flat JSON to CSV format
-#     # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
-#     # number of array compression levels, 4th arg: remove dollars mode
-#     status_csv = json2csv.json2csv(status_flat, True, 5, False)
-#
-#     # Get the default header with the fields to keep
-#     with open(config.CSV_HEADER) as f:
-#         header = f.readline()
-#
-#     csv_appendable_line = format_csv.get_csv_line(header, status_csv)
-#
-#     return csv_appendable_line
+def convert_tweet_to_csv(header: str, tweet: dict) -> str:
+    # Flatten the tweet and store it in status_flat
+    status_flat = json2csv.flatten_dictionary(tweet)
+    # Convert the flat JSON to CSV format
+    # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
+    # number of array compression levels, 4th arg: remove dollars mode
+    status_csv = json2csv.json2csv(status_flat, True, 5, False)
+    csv_appendable_line = format_csv.get_csv_line(header, status_csv)
+    return csv_appendable_line
 if __name__ == '__main__':
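Note: `json2csv.flatten_dictionary` and `format_csv.get_csv_line` come from the project's own `tweet_manager` library, so their exact behavior is not shown in this diff. As a purely illustrative sketch (not the library's implementation), dictionary flattening of this kind typically collapses nested keys into dotted paths:

```python
def flatten(d, parent_key="", sep="."):
    # Collapse nested dicts one level at a time:
    # {"user": {"id": 1}} -> {"user.id": 1}
    items = {}
    for key, value in d.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            items.update(flatten(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items

print(flatten({"user": {"id": 1, "name": "x"}, "text": "hi"}))
# {'user.id': 1, 'user.name': 'x', 'text': 'hi'}
```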
@@ -129,7 +155,7 @@ if __name__ == '__main__':
     # Dirs and files
     script_dir = os.path.dirname(__file__)
-    output_dir = os.path.join(script_dir, "pymongodump")
+    output_dir = os.path.join(script_dir, "pymongodump", args.database)
     header_file = os.path.join(script_dir, "header.txt")
     # MongoDB connection
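Note: the MongoDB connection block is truncated by the diff. A minimal sketch of what it plausibly contains, assuming a local MongoDB instance; the host, port and collection name `"tweets"` are assumptions, while `args.database` and `database_tweets` appear in the diff itself:

```python
import pymongo

client = pymongo.MongoClient("localhost", 27017)  # assumed host/port
database_tweets = client[args.database]["tweets"]  # collection name assumed
```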
@@ -138,11 +164,28 @@ if __name__ == '__main__':
     with open(header_file) as f:
         header = f.readline()
-    for tweet in database_tweets.find():
-        create_tweet_output_path(tweet, output_dir, args.database)
-        # flat_tweet = json2csv.flatten_dictionary(tweet)
-        # csv_rawline = json2csv.json2csv(flat_tweet, True, 5, False)
-        # csv_appendable_line =\
-        #     format_csv.get_csv_line(header, csv_rawline)
+    buffer_tweets = {}
+    for tweet in database_tweets.find():
+        # Get output path and contents for the new CSV file
+        csv_tweet_output_path =\
+            create_tweet_output_path(tweet, output_dir)
+        csv_tweet_contents =\
+            "\n" + str(convert_tweet_to_csv(header, tweet))
+        # Check if buffer exists for the file. If not, add to dictionary
+        if csv_tweet_output_path not in buffer_tweets.keys():
+            buffer_tweets[csv_tweet_output_path] = ["", 0]
+        # Update the buffer adding the tweet and increasing tweet count
+        buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
+        buffer_tweets[csv_tweet_output_path][1] += 1
+    # Perform the write operations in each of the files
+    for output_path in buffer_tweets.keys():
+        with open(output_path, "a") as tweet_writer:
+            tweet_writer.write(buffer_tweets[output_path][0])
+        increase_metadata_count(
+            os.path.join(output_dir, ".metadata.json"),
+            output_path, increase=buffer_tweets[output_path][1]
+        )
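Note: buffering per output path means each CSV file (and the metadata counter) is written once per run instead of once per tweet. The same accumulation could be expressed a bit more directly with `collections.defaultdict`; an equivalent sketch of the loop above, not the committed code:

```python
from collections import defaultdict

buffer_tweets = defaultdict(lambda: ["", 0])
for tweet in database_tweets.find():
    path = create_tweet_output_path(tweet, output_dir)
    buffer_tweets[path][0] += "\n" + convert_tweet_to_csv(header, tweet)
    buffer_tweets[path][1] += 1
```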