Commit 95eb843f by serpucga

Script working (seemingly fine)

parent dcc81cb0
@@ -5,12 +5,17 @@ import os
 import argparse
 import json
 import re
+import datetime
+from email.utils import parsedate
 from tweet_manager.lib import json2csv, format_csv


+def parse_datetime(string):
+    return datetime.datetime(*(parsedate(string)[:6]))
+
+
 def create_task_database_structure(
-        output_dir: str,
-        db_name: str)\
+        output_dir: str)\
         -> str:
     """
     Generate the following directory tree: a top dir that will contain
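
Editor's note: the new parse_datetime helper builds a datetime from the first six fields (year through second) of the time tuple returned by email.utils.parsedate. A quick check with an RFC 2822 timestamp (the example string is hypothetical):

```python
import datetime
from email.utils import parsedate

# parsedate returns a 9-field time tuple; the first six are
# (year, month, day, hour, minute, second), which is exactly what
# datetime.datetime takes positionally.
stamp = "Mon, 01 Oct 2018 13:08:45 +0000"  # hypothetical example
print(datetime.datetime(*(parsedate(stamp)[:6])))
# -> 2018-10-01 13:08:45
```

The timezone offset is discarded by the slice, so the resulting datetime is naive.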
@@ -19,6 +24,7 @@ def create_task_database_structure(
     """

     # Create the root directory for the tweet collection
+    (output_dir, db_name) = os.path.split(output_dir)
     if not os.path.isdir(output_dir):
         print(
             "Building directory to contain the collected tweets at: "
@@ -67,12 +73,35 @@ def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
         add_newfile_to_metadata(file_path, metadata_path)


+def increase_metadata_count(
+        metadata_path: str,
+        file_path: str,
+        increase: int = 1)\
+        -> None:
+    """
+    Use this when one tweet is appended to one of the CSVs in the
+    collection. This function will update the metadata file by increasing
+    by x the corresponding dictionary structure
+    """
+
+    print("Executing increase_metadata_count")
+    try:
+        with open(metadata_path, "r+") as f:
+            metadata_file = json.load(f)
+            metadata_file["files"][file_path]["count"] += increase
+            f.seek(0)
+            f.truncate()
+            json.dump(metadata_file, f)
+    except IOError:
+        generate_metadata_file(metadata_path)
+        increase_metadata_count(metadata_path, file_path, increase)
+
+
 def create_tweet_output_path(
         tweet: dict,
-        output_dir: str,
-        db_name: str)\
+        output_dir: str)\
         -> str:
-    collection_path = create_task_database_structure(output_dir, db_name)
+    collection_path = create_task_database_structure(output_dir)

     # Extract year, month and date from the tweet using a regex
     matchObj = re.search(
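
Editor's note: increase_metadata_count assumes a .metadata.json whose "files" mapping already holds an entry for file_path (presumably created by add_newfile_to_metadata); only a missing metadata file is handled, via the IOError branch. A sketch of the layout the update path appears to expect, with hypothetical contents:

```python
import json

# Assumed shape of .metadata.json: a "files" mapping from each CSV
# path to a record carrying a running tweet count.
metadata = {
    "files": {
        "pymongodump/mydb/2018/10/01.csv": {"count": 42},  # hypothetical
    }
}

# The in-place update that increase_metadata_count performs:
metadata["files"]["pymongodump/mydb/2018/10/01.csv"]["count"] += 3
print(json.dumps(metadata, indent=2))
```

Note that a metadata file lacking the file_path key would raise KeyError rather than IOError, which this handler does not catch.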
@@ -99,23 +128,20 @@ def create_tweet_output_path(
     return tweet_output_file


-#
-# def convert_tweet_to_csv(tweet: dict) -> str:
-#     # Flatten the tweet and store it in status_flat
-#     status_flat = json2csv.flatten_dictionary(tweet)
-#
-#     # Convert the flat JSON to CSV format
-#     # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
-#     # number of array compression levels, 4th arg: remove dollars mode
-#     status_csv = json2csv.json2csv(status_flat, True, 5, False)
-#
-#     # Get the default header with the fields to keep
-#     with open(config.CSV_HEADER) as f:
-#         header = f.readline()
-#
-#     csv_appendable_line = format_csv.get_csv_line(header, status_csv)
-#
-#     return csv_appendable_line
+def convert_tweet_to_csv(header: str, tweet: dict) -> str:
+    # Flatten the tweet and store it in status_flat
+    status_flat = json2csv.flatten_dictionary(tweet)
+
+    # Convert the flat JSON to CSV format
+    # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
+    # number of array compression levels, 4th arg: remove dollars mode
+    status_csv = json2csv.json2csv(status_flat, True, 5, False)
+
+    csv_appendable_line = format_csv.get_csv_line(header, status_csv)
+
+    return csv_appendable_line


 if __name__ == '__main__':
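
Editor's note: reviving convert_tweet_to_csv with header as a parameter moves the header read out of the per-tweet path; the main block reads header.txt once and passes it in on every call instead of reopening config.CSV_HEADER per tweet. A hypothetical call, assuming the tweet_manager package from this repository is importable and the function above is in scope:

```python
# Hypothetical usage; "header.txt" and the tweet are placeholders.
with open("header.txt") as f:
    header = f.readline()

tweet = {"id": 1, "text": "hello world"}  # minimal stand-in document
csv_line = convert_tweet_to_csv(header, tweet)
print(csv_line)
```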
@@ -129,7 +155,7 @@ if __name__ == '__main__':

     # Dirs and files
     script_dir = os.path.dirname(__file__)
-    output_dir = os.path.join(script_dir, "pymongodump")
+    output_dir = os.path.join(script_dir, "pymongodump", args.database)
     header_file = os.path.join(script_dir, "header.txt")

     # MongoDB connection
@@ -138,11 +164,28 @@ if __name__ == '__main__':
     with open(header_file) as f:
         header = f.readline()

-    for tweet in database_tweets.find():
-        create_tweet_output_path(tweet, output_dir, args.database)
-        # flat_tweet = json2csv.flatten_dictionary(tweet)
-        # csv_rawline = json2csv.json2csv(flat_tweet, True, 5, False)
-        # csv_appendable_line =\
-        #     format_csv.get_csv_line(header, csv_rawline)
+    buffer_tweets = {}
+    for tweet in database_tweets.find():
+        # Get output path and contents for the new CSV file
+        csv_tweet_output_path =\
+            create_tweet_output_path(tweet, output_dir)
+        csv_tweet_contents =\
+            "\n" + str(convert_tweet_to_csv(header, tweet))
+
+        # Check if buffer exists for the file. If not, add to dictionary
+        if csv_tweet_output_path not in buffer_tweets.keys():
+            buffer_tweets[csv_tweet_output_path] = ["", 0]
+
+        # Update the buffer adding the tweet and increasing tweet count
+        buffer_tweets[csv_tweet_output_path][0] += csv_tweet_contents
+        buffer_tweets[csv_tweet_output_path][1] += 1
+
+    # Perform the write operations in each of the files
+    for output_path in buffer_tweets.keys():
+        with open(output_path, "a") as tweet_writer:
+            tweet_writer.write(buffer_tweets[output_path][0])
+        increase_metadata_count(
+            os.path.join(output_dir, ".metadata.json"),
+            output_path, increase=buffer_tweets[output_path][1]
+        )
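
Editor's note: the new main loop batches writes instead of touching a file per tweet: CSV lines are accumulated in memory keyed by output path, then each file is opened exactly once for appending and its metadata count is bumped once. The same pattern in isolation, with hypothetical data:

```python
from collections import defaultdict

# Accumulate (contents, count) per output path, then flush each
# file with a single append. Paths and rows are hypothetical.
rows = [("a.csv", "row1"), ("b.csv", "row2"), ("a.csv", "row3")]

buffers = defaultdict(lambda: ["", 0])
for path, line in rows:
    buffers[path][0] += "\n" + line
    buffers[path][1] += 1

for path, (contents, count) in buffers.items():
    with open(path, "a") as f:
        f.write(contents)
    print(path, "buffered", count, "rows")
```

The trade-off is memory: every tweet in the collection is held until the loop ends, in exchange for one open/write and one metadata update per file.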