Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
migration_scripts
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
serpucga
migration_scripts
Commits
888acbe2
Commit
888acbe2
authored
Jul 22, 2019
by
serpucga
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature/fault_tolerance' into develop
parents
36d7a65a
ab69fb73
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
51 additions
and
11 deletions
+51
-11
.gitignore
.gitignore
+2
-0
utils.py
lib/utils.py
+0
-0
pymongoexport_csv.py
pymongoexport_csv.py
+49
-11
No files found.
.gitignore
View file @
888acbe2
pymongodump
recovery
tests.py
.mypy_cache
.recovery*
lib/utils.py
View file @
888acbe2
This diff is collapsed.
Click to expand it.
pymongoexport_csv.py
View file @
888acbe2
...
...
@@ -2,9 +2,11 @@
import
pymongo
import
os
import
sys
import
argparse
import
logging
import
time
import
json
import
multiprocessing
as
mp
from
config
import
globals
from
lib
import
utils
...
...
@@ -17,6 +19,7 @@ parser.add_argument("-p", "--port", type=int, default=27017)
parser
.
add_argument
(
"-s"
,
"--pagesize"
,
type
=
int
,
default
=
1000
)
parser
.
add_argument
(
"-v"
,
"--verbose"
,
action
=
"store_true"
)
parser
.
add_argument
(
"-t"
,
"--timing"
,
action
=
"store_true"
)
parser
.
add_argument
(
"-r"
,
"--recovery"
,
type
=
str
)
parser
.
add_argument
(
"database"
,
type
=
str
)
args
=
parser
.
parse_args
()
...
...
@@ -44,13 +47,32 @@ if args.timing:
time0
=
time
.
time
()
# MongoDB connection to get page index
client
=
pymongo
.
MongoClient
(
args
.
host
,
args
.
port
)
database_tweets
=
client
[
args
.
database
][
"tweets"
]
page_index
=
utils
.
get_page_index
(
database_tweets
,
args
.
pagesize
)
client
.
close
()
logger
.
debug
(
"Database {} partitioned in {} pages of {} tweets (maximum)"
.
format
(
args
.
database
,
len
(
page_index
),
args
.
pagesize
))
if
args
.
recovery
:
with
open
(
args
.
recovery
)
as
f
:
recovery_data
=
json
.
load
(
f
)
client
=
pymongo
.
MongoClient
(
recovery_data
[
"host"
],
recovery_data
[
"port"
])
database_tweets
=
client
[
recovery_data
[
"database"
]][
"tweets"
]
full_page_index
=
utils
.
get_page_index
(
database_tweets
,
recovery_data
[
"pagesize"
])
client
.
close
()
page_index
=
[
page
for
page
in
full_page_index
if
page
not
in
recovery_data
[
"dumped_pages"
]]
if
"error_page"
in
recovery_data
:
logger
.
debug
(
"Discarding corrupted page"
)
page_index
.
remove
(
recovery_data
.
pop
(
"error_page"
))
logger
.
debug
(
"Resuming collection conversion. {} of {} pages left."
.
format
(
len
(
page_index
),
len
(
full_page_index
)))
else
:
client
=
pymongo
.
MongoClient
(
args
.
host
,
args
.
port
)
database_tweets
=
client
[
args
.
database
][
"tweets"
]
page_index
=
utils
.
get_page_index
(
database_tweets
,
args
.
pagesize
)
client
.
close
()
logger
.
debug
(
"Database {} partitioned in {} pages of {} tweets (maximum)"
.
format
(
args
.
database
,
len
(
page_index
),
args
.
pagesize
))
# Build a picklable function that we can pass to map
...
...
@@ -65,17 +87,31 @@ def process_data_page(
# Launch single process to write to the filesystem
writer_worker
=
mp
.
Process
(
target
=
utils
.
filesystem_writer
,
args
=
(
task_queue
,
header
,
))
target
=
utils
.
filesystem_writer
,
args
=
(
task_queue
,
header
,
args
.
host
,
args
.
port
,
args
.
database
,
args
.
pagesize
,
output_dir
,
args
.
recovery
))
writer_worker
.
start
()
# Launch pool of workers to perform the format conversion
with
mp
.
Pool
()
as
pool
:
pool
.
map
(
process_data_page
,
page_index
)
task_queue
.
put
(
"END"
)
try
:
with
mp
.
Pool
()
as
pool
:
pool
.
map
(
process_data_page
,
page_index
)
except
utils
.
ExceptionAtPage
as
exc
:
logger
.
error
(
"Error detected at page {}"
.
format
(
exc
.
error_page
))
task_queue
.
put
((
exc
.
error_page
,
"ERROR"
))
sys
.
exit
(
1
)
except
(
Exception
,
KeyboardInterrupt
):
logger
.
error
(
"Error detected"
)
task_queue
.
put
((
-
2
,
"ERROR"
))
sys
.
exit
(
1
)
task_queue
.
put
((
-
1
,
"END"
))
if
globals
.
timing
:
time1
=
time
.
time
()
utils
.
generate_metadata_file
(
output_dir
)
logger
.
info
(
"Metadata file created"
)
if
globals
.
timing
:
logger
.
critical
(
"Time spent generating metadata file: {}s"
...
...
@@ -83,3 +119,5 @@ if globals.timing:
logger
.
critical
(
"Total execution time: {}s"
.
format
(
time
.
time
()
-
time0
))
logger
.
info
(
"Conversion completed successfully!!"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment