Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
migration_scripts
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
serpucga
migration_scripts
Commits
844fabe9
Commit
844fabe9
authored
Jul 16, 2019
by
serpucga
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Reformating
parent
2e50b803
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
24 deletions
+52
-24
utils.py
lib/utils.py
+48
-17
pymongoexport_csv.py
pymongoexport_csv.py
+4
-7
No files found.
lib/utils.py
View file @
844fabe9
...
...
@@ -3,6 +3,7 @@ import re
import
pymongo
import
json
from
math
import
ceil
from
typing
import
List
from
tweet_manager.lib
import
json2csv
,
format_csv
...
...
@@ -76,17 +77,15 @@ def convert_tweet_to_csv(header: str, tweet: dict) -> str:
return
csv_appendable_line
def get_page_index(collection, page_size: int):
    """
    Return the list of page numbers of a collection for a given page size.

    The result is ``[0, 1, ..., N - 1]``, where N is ``collection.count()``
    divided by ``page_size`` and rounded up. When the count is 0 the list
    is empty.

    :param collection: object exposing a ``count()`` method
    :param page_size: number of entries per page
    :returns: list of consecutive ints starting at 0, one per page
    :raises ZeroDivisionError: if ``page_size`` is 0
    """
    # NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and
    # removed in 4.0; confirm the installed version before upgrading.
    num_pages = ceil(collection.count() / page_size)
    return list(range(num_pages))
def get_tweets_page(collection, page_size: int, num_page: int):
    """
    Return the query result restricted to one page of the collection.

    Calls ``collection.find()`` with no filter, skips the first
    ``num_page * page_size`` entries and limits the result to ``page_size``
    entries.

    :param collection: object exposing ``find()``, whose result supports
        chained ``skip()`` and ``limit()`` calls
    :param page_size: number of entries per page
    :param num_page: zero-based number of the page to fetch
    :returns: whatever the chained ``find().skip().limit()`` call returns
    """
    # No sort is requested, so the order of entries is whatever the
    # database returns for an unsorted find().
    first_entry = num_page * page_size
    tweets = collection.find().skip(first_entry).limit(page_size)
    return tweets
def
write_tweets_to_files
(
host
,
port
,
database
,
pagesize
,
header
:
str
,
output_dir
:
str
,
page_index
):
def
write_tweets_to_files
(
host
:
str
,
port
:
int
,
database
:
str
,
pagesize
:
int
,
header
:
str
,
output_dir
:
str
,
page_index
:
list
)
\
->
None
:
print
(
"Hi there! write_tweets_to_files executing"
)
client
=
pymongo
.
MongoClient
(
host
,
port
)
database_tweets
=
client
[
database
][
"tweets"
]
...
...
@@ -114,17 +113,38 @@ def write_tweets_to_files(host, port, database, pagesize, header: str,
tweet_writer
.
write
(
buffer_tweets
[
output_path
][
0
])
#########################
# TWEET DB PAGINATION #
#########################
def get_page_index(
        collection: pymongo.collection.Collection,
        page_size: int) -> List[int]:
    """
    Get a list with the ints between 0 and N-1, where N is the number
    of pages of the collection for the given page size.

    N is ``collection.count()`` divided by ``page_size`` and rounded up,
    so an empty collection yields an empty list.

    :param collection: collection whose entries are to be paginated
    :param page_size: number of entries per page
    :returns: list of page numbers, starting at 0
    :raises ZeroDivisionError: if ``page_size`` is 0
    """
    # NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and
    # removed in 4.0; confirm the installed version before upgrading.
    num_pages = ceil(collection.count() / page_size)
    return list(range(num_pages))
def get_tweets_page(
        collection: pymongo.collection.Collection,
        page_size: int,
        num_page: int) -> pymongo.cursor.Cursor:
    """
    Returns a pymongo cursor pointing to the MongoDB entries comprised
    in the current page.

    The cursor comes from an unfiltered ``find()`` that skips the first
    ``num_page * page_size`` entries and is limited to ``page_size``
    entries.

    :param collection: collection to read the entries from
    :param page_size: number of entries per page
    :param num_page: zero-based number of the page to fetch
    :returns: cursor over the entries of the requested page
    """
    # No sort is requested, so the order of entries is whatever the
    # database returns for an unsorted find().
    first_entry = num_page * page_size
    tweets = collection.find().skip(first_entry).limit(page_size)
    return tweets
#########################
# METADATA GENERATION #
#########################
def
generate_metadata_file
(
collection_path
:
str
)
->
None
:
"""
Once all the CSV files have been created, generate a metadata file
...
...
@@ -144,3 +164,14 @@ def generate_metadata_file(collection_path: str) -> None:
output_path
=
os
.
path
.
join
(
collection_path
,
".metadata.json"
)
with
open
(
output_path
,
'w'
)
as
f
:
json
.
dump
(
metadata
,
f
)
def file_length(file_path: str) -> int:
    """
    Return the zero-based index of the last line of a file.

    For a non-empty file this is its number of lines minus one (a file
    with three lines gives 2). An empty file gives 0 instead of failing
    with UnboundLocalError.

    :param file_path: path of the text file to inspect
    :returns: number of lines minus one, never lower than 0
    """
    with open(file_path) as f:
        num_lines = sum(1 for _ in f)
    # Non-empty files always have num_lines >= 1, so max() only changes
    # the result for an empty file.
    return max(num_lines - 1, 0)
pymongoexport_csv.py
View file @
844fabe9
...
...
@@ -15,20 +15,17 @@ parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser
.
add_argument
(
"database"
,
type
=
str
)
args
=
parser
.
parse_args
()
# Dirs and files
# Initialize some variables
script_dir
=
os
.
path
.
dirname
(
__file__
)
output_dir
=
os
.
path
.
join
(
script_dir
,
"pymongodump"
,
args
.
database
)
header_file
=
os
.
path
.
join
(
script_dir
,
"header.txt"
)
# MongoDB connection
client
=
pymongo
.
MongoClient
(
args
.
host
,
args
.
port
)
database_tweets
=
client
[
args
.
database
][
"tweets"
]
with
open
(
header_file
)
as
f
:
header
=
f
.
readline
()
buffer_tweets
=
{}
num_page
=
0
# MongoDB connection to get page index
client
=
pymongo
.
MongoClient
(
args
.
host
,
args
.
port
)
database_tweets
=
client
[
args
.
database
][
"tweets"
]
page_index
=
utils
.
get_page_index
(
database_tweets
,
args
.
pagesize
)
client
.
close
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment