Commit dcc81cb0 by serpucga

Separated exporting to JSON from exporting to CSV. Implemented creation of a filesystem tree to store the CSVs.

parent 56c27157
id,text,created_at,source,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,quoted_status_id,is_quote_status,retweet_count,favorite_count,user.id,user.name,user.created_at,user.screen_name,user.location,user.profile_image_url,user.verified,user.followers_count,user.friends_count,user.listed_count,user.favourites_count,user.statuses_count,user.geo_enabled,user.lang,entities.hashtags.text,entities.urls.expanded_url,entities.user_mentions.screen_name,entities.media.media_url,place.id,place.name,place.full_name,place.country,place.country_code,place.place_type,place.url,place.bounding_box.type,place.bounding_box.coordinates,coordinates.type,coordinates.coordinates
#!/usr/bin/env python
import pymongo
import os
import argparse
import json
import re
from tweet_manager.lib import json2csv, format_csv
def create_task_database_structure(
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Build the on-disk layout used to store a tweet collection.

    Makes sure the top-level output directory exists, then creates the
    per-task subdirectory named after the database (if missing) and
    seeds it with an empty ``.metadata.json`` file.

    :param output_dir: root directory holding every tweet collection
    :param db_name: name of the database, used as the subdirectory name
    :return: path of the collection subdirectory
    """
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    task_dir = os.path.join(output_dir, db_name)
    if not os.path.isdir(task_dir):
        print("Initializing collection " + db_name + "...")
        os.mkdir(task_dir)
        # Fresh collections start with an empty metadata registry
        generate_metadata_file(os.path.join(task_dir, ".metadata.json"))
    return task_dir
def generate_metadata_file(metadata_path) -> None:
    """
    Write a brand-new metadata file containing an empty file registry.

    :param metadata_path: full path where the JSON file is created;
        any existing file at that path is overwritten
    """
    print("Executing generate_metadata_file")
    skeleton = {"files": {}}  # maps CSV paths to their per-file info
    with open(metadata_path, "w") as out:
        json.dump(skeleton, out)
def add_newfile_to_metadata(file_path: str, metadata_path: str) -> None:
    """
    Register a freshly created CSV in the collection's metadata file.

    The new entry starts with a tweet count of 0, so this should only
    be used for files that were just added to the collection.  When the
    metadata file does not exist yet, it is created and the
    registration is retried.

    :param file_path: path of the CSV file to register
    :param metadata_path: path of the collection's .metadata.json
    """
    print("Executing add_newfile_to_metadata")
    try:
        with open(metadata_path, "r+") as meta:
            contents = json.load(meta)
            contents["files"][file_path] = {"count": 0}
            # Rewrite the file in place from the beginning
            meta.seek(0)
            meta.truncate()
            json.dump(contents, meta)
    except IOError:
        # Metadata file missing: create it, then retry once
        generate_metadata_file(metadata_path)
        add_newfile_to_metadata(file_path, metadata_path)
def create_tweet_output_path(
        tweet: dict,
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Compute (and create if needed) the CSV file a tweet belongs to.

    The tweet is classified chronologically: a year/month directory
    tree is built under the collection directory, and the tweet maps to
    a per-day CSV file.  A CSV file that did not exist yet is seeded
    with the header line and registered in the collection's metadata.

    :param tweet: tweet document as fetched from MongoDB
    :param output_dir: root directory for all tweet collections
    :param db_name: name of the database the tweet belongs to
    :return: path of the CSV file this tweet should be appended to
    :raises ValueError: if the tweet's creation date cannot be parsed
    """
    collection_path = create_task_database_structure(output_dir, db_name)
    # Extract year, month and day from the tweet using a regex
    match = re.search(
        r"^(\d{4})-(\d{2})-(\d{2}) \d{2}:\d{2}:\d{2}$",
        str(tweet["created_at"])
    )
    if match is None:
        # Fail loudly instead of raising a cryptic AttributeError below
        raise ValueError(
            "Unparsable created_at value: " + str(tweet["created_at"]))
    year, month, day = match.group(1), match.group(2), match.group(3)
    # NOTE(review): the day slot is left empty because the directory
    # tree only goes down to year/month; the day becomes the CSV file
    # name below — confirm against json2csv.mkdir_tweet_date
    date = (year, month, "")
    # Classify the tweet chronologically
    tweet_output_path = json2csv.mkdir_tweet_date(date, collection_path)
    tweet_output_file = os.path.join(tweet_output_path, day + ".csv")
    # If the CSV file didn't already exist, initialize it with a header
    if not os.path.isfile(tweet_output_file):
        # header_file is a module-level global set in the __main__
        # section of this script
        with open(header_file) as f:
            header = f.readline().strip()
        with open(tweet_output_file, "w") as fw:
            fw.write(header)
        add_newfile_to_metadata(
            tweet_output_file,
            os.path.join(collection_path, ".metadata.json"))
    return tweet_output_file
#
# def convert_tweet_to_csv(tweet: dict) -> str:
# # Flatten the tweet and store it in status_flat
# status_flat = json2csv.flatten_dictionary(tweet)
#
# # Convert the flat JSON to CSV format
# # 1st arg: flat tweet, 2nd arg: activate array compression, 3rd arg:
# # number of array compression levels, 4th arg: remove dollars mode
# status_csv = json2csv.json2csv(status_flat, True, 5, False)
#
# # Get the default header with the fields to keep
# with open(config.CSV_HEADER) as f:
# header = f.readline()
#
# csv_appendable_line = format_csv.get_csv_line(header, status_csv)
#
# return csv_appendable_line
if __name__ == '__main__':
    # Command line parsing
    arg_parser = argparse.ArgumentParser(
        description="Dump the tweets of a database to a JSON file")
    arg_parser.add_argument("-H", "--host", type=str, default="localhost")
    arg_parser.add_argument("-p", "--port", type=int, default=27017)
    arg_parser.add_argument("database", type=str)
    args = arg_parser.parse_args()

    # Dirs and files.  NOTE: header_file is also read as a global by
    # create_tweet_output_path, so its name must not change.
    script_dir = os.path.dirname(__file__)
    output_dir = os.path.join(script_dir, "pymongodump")
    header_file = os.path.join(script_dir, "header.txt")

    # MongoDB connection
    client = pymongo.MongoClient(args.host, args.port)
    database_tweets = client[args.database]["tweets"]

    with open(header_file) as f:
        header = f.readline()

    # Build the dated directory tree / day files for every stored tweet
    for tweet in database_tweets.find():
        create_tweet_output_path(tweet, output_dir, args.database)
        # flat_tweet = json2csv.flatten_dictionary(tweet)
        # csv_rawline = json2csv.json2csv(flat_tweet, True, 5, False)
        # csv_appendable_line =\
        #     format_csv.get_csv_line(header, csv_rawline)
......@@ -3,6 +3,8 @@
import pymongo
import os
import argparse
import pprint
from tweet_manager.lib.json2csv import flatten
parser = argparse.ArgumentParser(
description="Dump the tweets of a database to a JSON file")
......
import os
import json
def create_task_database_structure(
        output_dir: str,
        db_name: str)\
        -> str:
    """
    Build the on-disk layout used to store a tweet collection.

    Makes sure the top-level output directory exists, then creates the
    per-task subdirectory named after the database (if missing) and
    seeds it with an empty metadata file.

    :param output_dir: root directory holding every tweet collection
    :param db_name: name of the database, used as the subdirectory name
    :return: path of the collection subdirectory
    """
    if not os.path.isdir(output_dir):
        print(
            "Building directory to contain the collected tweets at: "
            + os.path.abspath(output_dir)
        )
        os.mkdir(output_dir)
    task_path = os.path.join(output_dir, db_name)
    # Nothing to do when the collection directory already exists
    if os.path.isdir(task_path):
        return task_path
    print("Initializing collection " + db_name + "...")
    os.mkdir(task_path)
    generate_metadata_file(task_path)
    return task_path
def generate_metadata_file(collection_path) -> None:
    """
    Create an empty ``.metadata.json`` registry inside a collection dir.

    :param collection_path: directory of the tweet collection; the
        metadata file is created directly inside it, overwriting any
        previous one
    """
    print("Executing generate_metadata_file")
    target = os.path.join(collection_path, ".metadata.json")
    # The registry maps CSV paths to their per-file info; starts empty
    with open(target, "w") as out:
        json.dump({"files": {}}, out)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment