Reformatting

844fabe9 · serpucga · 2e50b803 · 844fabe9 · 844fabe9
Commit 844fabe9 authored Jul 16, 2019 by serpucga
Hide whitespace changes
Inline Side-by-side

Showing with 52 additions and 24 deletions

utils.py lib/utils.py +48 -17

pymongoexport_csv.py pymongoexport_csv.py +4 -7

No files found.
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -3,6 +3,7 @@ import re
 import pymongo
 import json
 from math import ceil
+from typing import List
 from tweet_manager.lib import json2csv, format_csv


@@ -76,17 +77,15 @@ def convert_tweet_to_csv(header: str, tweet: dict) -> str:
    return csv_appendable_line


-def get_page_index(collection, page_size: int):
-    return list(range(0, ceil(collection.count() / page_size)))
-
-
-def get_tweets_page(collection, page_size: int, num_page: int):
-    tweets = collection.find().skip(num_page * page_size).limit(page_size)
-    return tweets
-
-
-def write_tweets_to_files(host, port, database, pagesize, header: str,
-                          output_dir: str, page_index):
+def write_tweets_to_files(
+        host: str,
+        port: int,
+        database: str,
+        pagesize: int,
+        header: str,
+        output_dir: str,
+        page_index: list)\
+        -> None:
    print("Hi there! write_tweets_to_files executing")
    client = pymongo.MongoClient(host, port)
    database_tweets = client[database]["tweets"]
@@ -114,17 +113,38 @@ def write_tweets_to_files(host, port, database, pagesize, header: str,
            tweet_writer.write(buffer_tweets[output_path][0])


-def file_length(file_path: str) -> int:
+#########################
+#  TWEET DB PAGINATION  #
+#########################
+def get_page_index(
+        collection: pymongo.collection.Collection,
+        page_size: int)\
+        -> List[int]:
    """
-    Calculate number of lines of a file
+    Get a list with the ints between 0 and N-1, where N is the number
+    of pages of the collection for the given page size
    """

-    with open(file_path) as f:
-        for i, l in enumerate(f):
-            pass
-        return i
+    return list(range(0, ceil(collection.count() / page_size)))


+def get_tweets_page(
+        collection: pymongo.collection.Collection,
+        page_size: int,
+        num_page: int)\
+        -> pymongo.cursor.Cursor:
+    """
+    Return a pymongo cursor over the MongoDB entries contained
+    in the page number "num_page"
+    """
+
+    tweets = collection.find().skip(num_page * page_size).limit(page_size)
+    return tweets
+
+
+#########################
+#  METADATA GENERATION  #
+#########################
 def generate_metadata_file(collection_path: str) -> None:
    """
    Once all the CSV files have been created, generate a metadata file
@@ -144,3 +164,14 @@ def generate_metadata_file(collection_path: str) -> None:
    output_path = os.path.join(collection_path, ".metadata.json")
    with open(output_path, 'w') as f:
        json.dump(metadata, f)
+
+
+def file_length(file_path: str) -> int:
+    """
+    Calculate number of lines of a file minus one (index of the last line)
+    """
+
+    with open(file_path) as f:
+        for i, l in enumerate(f):
+            pass
+        return i
--- a/pymongoexport_csv.py
+++ b/pymongoexport_csv.py
@@ -15,20 +15,17 @@ parser.add_argument("-s", "--pagesize", type=int, default=1000)
 parser.add_argument("database", type=str)
 args = parser.parse_args()

-# Dirs and files
+# Initialize some variables
 script_dir = os.path.dirname(__file__)
 output_dir = os.path.join(script_dir, "pymongodump", args.database)
 header_file = os.path.join(script_dir, "header.txt")
-
-# MongoDB connection
-client = pymongo.MongoClient(args.host, args.port)
-database_tweets = client[args.database]["tweets"]
-
 with open(header_file) as f:
    header = f.readline()
 buffer_tweets = {}
-num_page = 0

+# MongoDB connection to get page index
+client = pymongo.MongoClient(args.host, args.port)
+database_tweets = client[args.database]["tweets"]
 page_index = utils.get_page_index(database_tweets, args.pagesize)
 client.close()