serpucga / migration_scripts

Commit 888acbe2, authored Jul 22, 2019 by serpucga

Merge branch 'feature/fault_tolerance' into develop

Parents: 36d7a65a, ab69fb73

Showing 3 changed files with 269 additions and 61 deletions:

    .gitignore            +2    -0
    lib/utils.py          +218  -50
    pymongoexport_csv.py  +49   -11
.gitignore

pymongodump
recovery
tests.py
.mypy_cache
.recovery*
lib/utils.py

@@ -3,6 +3,7 @@ import pymongo
import json
import re
import time
import datetime
import multiprocessing as mp
from math import ceil
from typing import List
@@ -15,7 +16,16 @@ import logging
logger = logging.getLogger(__name__)


def filesystem_writer(
        queue: mp.Queue,
        header: str,
        host: str,
        port: int,
        database: str,
        pagesize: int,
        output_dir: str,
        recovery_file: str) \
        -> None:
    """
    Reads the CSV pages from the queue and writes them to filesystem
@@ -31,30 +41,47 @@ def filesystem_writer(queue: mp.Queue, header: str) -> None:
    logger.debug(
        "Worker {} launched: filesystem_writer executing"
        .format(os.getpid()))

    if recovery_file:
        recovery_file_path = recovery_file
    else:
        recovery_file_path = build_recovery_filepath(database)
        create_recovery_file(
            recovery_file_path, host, port, database, pagesize)

    while True:
        page_number, csv_page = queue.get()

        if csv_page == "END":
            logger.info("Writing loop finished")
            os.remove(recovery_file_path)
            break

        elif csv_page == "ERROR":
            logger.error("Dumping recovery file and exiting")
            if page_number >= 0:
                dump_error_recovery_file(recovery_file_path, page_number)
            break

        elif csv_page is not None:
            if globals.timing:
                time0 = time.time()
            for output_path in csv_page.keys():
                logger.debug("Dumping tweets for " + output_path)
                if os.path.isfile(output_path):
                    with open(output_path, "a") as writer:
                        writer.write(csv_page[output_path])
                else:
                    logger.debug(
                        "File {} not found, generating new..."
                        .format(output_path))
                    generate_path(output_path, header)
                    with open(output_path, "a") as writer:
                        writer.write(csv_page[output_path])
            if page_number >= 0:
                update_recovery_file(recovery_file_path, page_number)
            if globals.timing:
                logger.critical(
                    "Time spent writing tweet page to FS: {}s"
                    .format(time.time() - time0))

        else:
            continue
@@ -80,35 +107,45 @@ def process_page(
    :param queue: queue where processed data await to be written to FS
    """

    try:
        logger.debug(
            "Worker {} launched: process_page executing"
            .format(os.getpid()))
        if globals.timing:
            time0 = time.time()
        client = pymongo.MongoClient(host, port)
        database_tweets = client[database]["tweets"]
        tweets_page = get_tweets_page(database_tweets, pagesize, page_number)
        buffer_tweets = {}

        for tweet in tweets_page:
            csv_tweet_output_path = \
                get_tweet_output_path(tweet, output_dir)
            try:
                csv_tweet_contents = \
                    "\n" + str(convert_tweet_to_csv(header, tweet))
            except TweetConversionException as exc:
                logger.error(exc.message)
                logger.error("Origin tweet:\n" + str(exc.tweet))
                logger.error("Discarding tweet and proceeding...")
                continue

            if csv_tweet_output_path not in buffer_tweets.keys():
                buffer_tweets[csv_tweet_output_path] = ""
            buffer_tweets[csv_tweet_output_path] += csv_tweet_contents
        client.close()

        queue.put((page_number, buffer_tweets))
        logger.debug("Page {} enqueued".format(page_number))

        if globals.timing:
            time1 = time.time()
            logger.critical(
                "Time processing & buffering tweet page: {}s"
                .format(time1 - time0))

    except Exception:
        raise ExceptionAtPage(
            "Something failed while processing page", page_number)
def get_tweet_output_path(tweet: dict, output_dir: str) -> str:
    ...
@@ -206,13 +243,109 @@ def convert_tweet_to_csv(header: str, tweet: dict) -> str:
    fields in CSV form
    """

    try:
        flat_tweet = json2csv.flatten_dictionary(tweet)
    except Exception:
        raise TweetConversionException(
            "Error when flattening tweet", tweet)
    try:
        csv_tweet = json2csv.json2csv(flat_tweet, True, 5, False)
    except Exception:
        raise TweetConversionException(
            "Error when trying to convert tweet to CSV", flat_tweet)
    try:
        csv_appendable_tweet = format_csv.get_csv_line(header, csv_tweet)
    except Exception:
        raise TweetConversionException(
            "Error when formatting CSV tweet", csv_tweet)

    return csv_appendable_tweet


def build_recovery_filepath(dbname: str) -> str:
    """
    Build the path of a recovery file

    :param dbname: name of the database being queried
    :returns: the path of the recovery file generated in this execution
    """

    recovery_dir = "./recovery"
    if not os.path.isdir(recovery_dir):
        os.mkdir(recovery_dir)
    now = datetime.datetime.now()
    datetime_str = "%04d%02d%02d-%02d%02d%02d" % \
        (now.year, now.month, now.day, now.hour, now.minute, now.second)
    recovery_file_path = os.path.join(
        recovery_dir, "recovery_" + dbname + "_" + datetime_str + ".json")

    return recovery_file_path


def create_recovery_file(
        file_path: str,
        host: str,
        port: int,
        database: str,
        page_size: int) \
        -> None:
    """
    In case of error, dump information to file to allow recovery

    :param host: address of the host to which the script connected
    :param port: port of the Mongo database
    :param database: name of the database being queried
    :param page_size: size of the page that was being used
    """

    recovery_file_contents = {}
    recovery_file_contents["host"] = host
    recovery_file_contents["port"] = port
    recovery_file_contents["database"] = database
    recovery_file_contents["pagesize"] = page_size
    recovery_file_contents["dumped_pages"] = []

    with open(file_path, "w") as f:
        json.dump(recovery_file_contents, f)

    logger.error("Generated recovery file at {}".format(file_path))


def update_recovery_file(
        file_path: str,
        page_number: int) \
        -> None:
    """
    Add a new page to the list of already dumped pages in the recovery
    file

    :param file_path: path to the recovery file
    :param page_number: number of the page that was safely written
    """

    with open(file_path, "r") as f:
        recovery_file_contents = json.load(f)
    recovery_file_contents["dumped_pages"].append(page_number)
    with open(file_path, "w") as f:
        json.dump(recovery_file_contents, f)


def dump_error_recovery_file(
        file_path: str,
        page_number: int) \
        -> None:
    """
    Add information pointing to the page where error was detected

    :param file_path: path to the recovery file
    :param page_number: number of the page that crashed
    """

    with open(file_path, "r") as f:
        recovery_file_contents = json.load(f)
    recovery_file_contents["error_page"] = page_number
    with open(file_path, "w") as f:
        json.dump(recovery_file_contents, f)
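For orientation, the recovery files handled by the helpers above are plain JSON. The sketch below shows roughly what one could contain after a crash; the host, database name, and page numbers are made up for illustration, and only the keys set by create_recovery_file, update_recovery_file, and dump_error_recovery_file are used.

# Illustrative only: approximate contents of a recovery file, expressed as the
# Python dict that create_recovery_file / update_recovery_file build up.
recovery_file_contents = {
    "host": "localhost",              # assumed connection parameters
    "port": 27017,
    "database": "my_tweets_db",       # hypothetical database name
    "pagesize": 1000,
    "dumped_pages": [0, 1, 2, 3, 5],  # pages already written safely
    "error_page": 4,                  # page where the failure was detected
}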
#########################
# TWEET DB PAGINATION #
#########################
@@ -292,3 +425,38 @@ def file_length(file_path: str) -> int:
        for i, l in enumerate(f):
            pass
    return i
#######################
# CUSTOM EXCEPTIONS #
#######################
class ExceptionAtPage(Exception):
    """
    Exception designed to be raised when the conversion of a page of
    tweets taken from Mongo fails
    """

    def __init__(self, message: str, error_page: int):
        """
        :param message: str descriptive of the error
        :param error_page: int indicating the number of the page that failed
        """
        self.message = message
        self.error_page = error_page


class TweetConversionException(Exception):
    """
    Should be raised when a tweet raises an exception in the process of
    being converted
    """

    def __init__(self, message: str, tweet: str):
        """
        :param message: str descriptive of the error
        :param tweet: str with the contents of the tweet that caused the
            failure
        """
        self.message = message
        self.tweet = tweet
pymongoexport_csv.py

@@ -2,9 +2,11 @@
import pymongo
import os
import sys
import argparse
import logging
import time
import json
import multiprocessing as mp

from config import globals
from lib import utils

@@ -17,6 +19,7 @@ parser.add_argument("-p", "--port", type=int, default=27017)
parser.add_argument("-s", "--pagesize", type=int, default=1000)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-t", "--timing", action="store_true")
parser.add_argument("-r", "--recovery", type=str)
parser.add_argument("database", type=str)
args = parser.parse_args()
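With the new --recovery option, a run can be resumed from the JSON file left behind by a failed execution. The invocations below are illustrative only: the database name and recovery-file name are hypothetical, and the exact way the script is launched may differ.

    # First attempt, which may crash partway through
    python3 pymongoexport_csv.py -s 1000 -t my_tweets_db

    # Resume, skipping the pages already listed under "dumped_pages"
    python3 pymongoexport_csv.py -r recovery/recovery_my_tweets_db_20190722-153000.json my_tweets_db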
@@ -44,13 +47,32 @@ if args.timing:
    time0 = time.time()

if args.recovery:
    with open(args.recovery) as f:
        recovery_data = json.load(f)
    client = pymongo.MongoClient(recovery_data["host"], recovery_data["port"])
    database_tweets = client[recovery_data["database"]]["tweets"]
    full_page_index = utils.get_page_index(
        database_tweets, recovery_data["pagesize"])
    client.close()
    page_index = [page for page in full_page_index
                  if page not in recovery_data["dumped_pages"]]
    if "error_page" in recovery_data:
        logger.debug("Discarding corrupted page")
        page_index.remove(recovery_data.pop("error_page"))
    logger.debug(
        "Resuming collection conversion. {} of {} pages left."
        .format(len(page_index), len(full_page_index)))
else:
    # MongoDB connection to get page index
    client = pymongo.MongoClient(args.host, args.port)
    database_tweets = client[args.database]["tweets"]
    page_index = utils.get_page_index(database_tweets, args.pagesize)
    client.close()
    logger.debug(
        "Database {} partitioned in {} pages of {} tweets (maximum)"
        .format(args.database, len(page_index), args.pagesize))

# Build a picklable function that we can pass to map

@@ -65,17 +87,31 @@ def process_data_page(
# Launch single process to write to the filesystem
writer_worker = mp.Process(
    target=utils.filesystem_writer,
    args=(task_queue, header, args.host, args.port,
          args.database, args.pagesize, output_dir, args.recovery))
writer_worker.start()

# Launch pool of workers to perform the format conversion
try:
    with mp.Pool() as pool:
        pool.map(process_data_page, page_index)
except utils.ExceptionAtPage as exc:
    logger.error("Error detected at page {}".format(exc.error_page))
    task_queue.put((exc.error_page, "ERROR"))
    sys.exit(1)
except (Exception, KeyboardInterrupt):
    logger.error("Error detected")
    task_queue.put((-2, "ERROR"))
    sys.exit(1)
task_queue.put((-1, "END"))

if globals.timing:
    time1 = time.time()
utils.generate_metadata_file(output_dir)
logger.info("Metadata file created")
if globals.timing:
    logger.critical(
        "Time spent generating metadata file: {}s"

@@ -83,3 +119,5 @@ if globals.timing:
    logger.critical(
        "Total execution time: {}s".format(time.time() - time0))

logger.info("Conversion completed successfully!!")
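Taken together, the two files implement a simple producer/consumer layout: a pool of process_page workers puts (page_number, payload) tuples on a queue, and a single filesystem_writer process drains it until it receives an "END" (or "ERROR") sentinel. The self-contained sketch below mirrors that wiring in isolation; it is not the project code, and it uses a Manager queue so the queue can be passed through Pool.map without extra plumbing.

import multiprocessing as mp


def writer(queue) -> None:
    # Single consumer: drain (page_number, payload) tuples until "END" arrives
    while True:
        page_number, payload = queue.get()
        if payload == "END":
            break
        print("writing page", page_number, "->", payload)


def process_page(args) -> None:
    # Producer: pretend to convert one page and enqueue the result
    page_number, queue = args
    queue.put((page_number, "csv chunk {}".format(page_number)))


if __name__ == "__main__":
    manager = mp.Manager()
    queue = manager.Queue()
    writer_worker = mp.Process(target=writer, args=(queue,))
    writer_worker.start()
    with mp.Pool() as pool:
        pool.map(process_page, [(i, queue) for i in range(5)])
    queue.put((-1, "END"))
    writer_worker.join()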