Commit 844fabe9 by serpucga

Reformatting

parent 2e50b803
...@@ -3,6 +3,7 @@ import re ...@@ -3,6 +3,7 @@ import re
import pymongo import pymongo
import json import json
from math import ceil from math import ceil
from typing import List
from tweet_manager.lib import json2csv, format_csv from tweet_manager.lib import json2csv, format_csv
...@@ -76,17 +77,15 @@ def convert_tweet_to_csv(header: str, tweet: dict) -> str: ...@@ -76,17 +77,15 @@ def convert_tweet_to_csv(header: str, tweet: dict) -> str:
return csv_appendable_line return csv_appendable_line
def get_page_index(collection, page_size: int): def write_tweets_to_files(
return list(range(0, ceil(collection.count() / page_size))) host: str,
port: int,
database: str,
def get_tweets_page(collection, page_size: int, num_page: int): pagesize: int,
tweets = collection.find().skip(num_page * page_size).limit(page_size) header: str,
return tweets output_dir: str,
page_index: list)\
-> None:
def write_tweets_to_files(host, port, database, pagesize, header: str,
output_dir: str, page_index):
print("Hi there! write_tweets_to_files executing") print("Hi there! write_tweets_to_files executing")
client = pymongo.MongoClient(host, port) client = pymongo.MongoClient(host, port)
database_tweets = client[database]["tweets"] database_tweets = client[database]["tweets"]
...@@ -114,17 +113,38 @@ def write_tweets_to_files(host, port, database, pagesize, header: str, ...@@ -114,17 +113,38 @@ def write_tweets_to_files(host, port, database, pagesize, header: str,
tweet_writer.write(buffer_tweets[output_path][0]) tweet_writer.write(buffer_tweets[output_path][0])
def file_length(file_path: str) -> int: #########################
# TWEET DB PAGINATION #
#########################
def get_page_index(
        collection: pymongo.collection.Collection,
        page_size: int)\
        -> List[int]:
    """
    Get a list with the ints between 0 and N-1, where N is the number
    of pages of the collection for the given page size.

    The last page may contain fewer than page_size documents, hence
    the ceil when computing N.
    """
    # NOTE(review): Collection.count() is deprecated since PyMongo 3.7
    # and removed in 4.0; count_documents({}) or
    # estimated_document_count() is the replacement — confirm the
    # pinned pymongo version before switching.
    return list(range(0, ceil(collection.count() / page_size)))
def get_tweets_page(
        collection: pymongo.collection.Collection,
        page_size: int,
        num_page: int)\
        -> pymongo.cursor.Cursor:
    """
    Return a pymongo cursor pointing to the MongoDB entries that make
    up the requested page of the collection.
    """
    # Skip all the documents of the preceding pages, then cap the
    # cursor at one page worth of documents.
    offset = num_page * page_size
    return collection.find().skip(offset).limit(page_size)
#########################
# METADATA GENERATION #
#########################
def generate_metadata_file(collection_path: str) -> None: def generate_metadata_file(collection_path: str) -> None:
""" """
Once all the CSV files have been created, generate a metadata file Once all the CSV files have been created, generate a metadata file
...@@ -144,3 +164,14 @@ def generate_metadata_file(collection_path: str) -> None: ...@@ -144,3 +164,14 @@ def generate_metadata_file(collection_path: str) -> None:
output_path = os.path.join(collection_path, ".metadata.json") output_path = os.path.join(collection_path, ".metadata.json")
with open(output_path, 'w') as f: with open(output_path, 'w') as f:
json.dump(metadata, f) json.dump(metadata, f)
def file_length(file_path: str) -> int:
    """
    Calculate the number of lines of a file.

    Returns 0 for an empty file.  The previous implementation returned
    the index of the last line (lines - 1) and raised
    UnboundLocalError on an empty file.
    """
    with open(file_path) as f:
        # Stream the file line by line; no need to hold it in memory.
        return sum(1 for _ in f)
...@@ -15,20 +15,17 @@ parser.add_argument("-s", "--pagesize", type=int, default=1000) ...@@ -15,20 +15,17 @@ parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser.add_argument("database", type=str) parser.add_argument("database", type=str)
args = parser.parse_args() args = parser.parse_args()
# Dirs and files # Initialize some variables
script_dir = os.path.dirname(__file__) script_dir = os.path.dirname(__file__)
output_dir = os.path.join(script_dir, "pymongodump", args.database) output_dir = os.path.join(script_dir, "pymongodump", args.database)
header_file = os.path.join(script_dir, "header.txt") header_file = os.path.join(script_dir, "header.txt")
# MongoDB connection
client = pymongo.MongoClient(args.host, args.port)
database_tweets = client[args.database]["tweets"]
with open(header_file) as f: with open(header_file) as f:
header = f.readline() header = f.readline()
buffer_tweets = {} buffer_tweets = {}
num_page = 0
# MongoDB connection to get page index
client = pymongo.MongoClient(args.host, args.port)
database_tweets = client[args.database]["tweets"]
page_index = utils.get_page_index(database_tweets, args.pagesize) page_index = utils.get_page_index(database_tweets, args.pagesize)
client.close() client.close()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment