Commit 064f7030 by Serbaf

Version 5.0

* Code from tweet_model.py is now divided into two source files: tweet.py (containing the Tweet class) and utils.py (which contains the rest of the utilities to instantiate Tweet objects from CSVs, return a mini-dict with partial information of a Tweet, etc.) * Added two new fields to the Tweet class: "polarity" and "trtext", which are not part of the original Twitter tweets but are needed in another project. * Added setter methods for "polarity" and "trtext", which will probably not be set at instantiation time but afterwards
parent b5cc4444
...@@ -66,3 +66,15 @@ contains certain symbols ...@@ -66,3 +66,15 @@ contains certain symbols
Version 0.4.7: Version 0.4.7:
Added function to generate dicts representing subsets of Tweet content (return Added function to generate dicts representing subsets of Tweet content (return
just the fields indicated by the user and not the full Tweet object) just the fields indicated by the user and not the full Tweet object)
0.5.0 (2019-05-03)
------------------
* Code from tweet_model.py now divided in two source files: tweet.py
(containing the Tweet class) and utils.py (which contains the rest of the
utilities to instantiate Tweet objects from CSVs, return a mini-dict with
partial information of a Tweet, etc.)
* Added two new fields to the Tweet class: "polarity" and "trtext", which are
not part of the original Twitter tweets but are needed in another project.
* Added setter methods for "polarity" and "trtext", which will probably not be
set at instantiation time but afterwards
...@@ -51,6 +51,6 @@ setup( ...@@ -51,6 +51,6 @@ setup(
test_suite='tests', test_suite='tests',
tests_require=test_requirements, tests_require=test_requirements,
url='https://github.com/Serbaf/tweet_model', url='https://github.com/Serbaf/tweet_model',
version='0.4.7', version='0.5.0',
zip_safe=False, zip_safe=False,
) )
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""Main module.""" """Main module."""
import logging
from typing import Union, Dict, List, Generator
from tweet_manager.lib import format_csv
# Configure logger
LOG_FORMAT = '[%(asctime)-15s] %(levelname)s: %(message)s'
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
logger = logging.getLogger("logger")
class Tweet(): class Tweet():
""" """
...@@ -138,7 +128,12 @@ class Tweet(): ...@@ -138,7 +128,12 @@ class Tweet():
extended_entities__media__source_status_id=None, extended_entities__media__source_status_id=None,
extended_entities__media__source_status_id_str=None, extended_entities__media__source_status_id_str=None,
extended_entities__media__type=None, extended_entities__media__type=None,
extended_entities__media__url=None): extended_entities__media__url=None,
# Additional fields (not from the Twitter model)
polarity=None,
trtext=None
):
# Basic attributes # Basic attributes
self.created_at = created_at self.created_at = created_at
...@@ -360,121 +355,16 @@ class Tweet(): ...@@ -360,121 +355,16 @@ class Tweet():
extended_entities__media__type extended_entities__media__type
self.extended_entities["media"]["url"] = extended_entities__media__url self.extended_entities["media"]["url"] = extended_entities__media__url
def __getitem__(self, key): # Additional fields
return getattr(self, key) self.polarity = polarity
self.trtext = trtext
class NotValidTweetError(Exception):
pass
def get_tweet_from_csv_raw_line(header, line):
"""
Given a CSV header and a CSV line in raw format (strings with comma
separated values), extract the values for every field and then calls
get_tweet_from_csv_line to instance a Tweet.
Returns a Tweet object
"""
header_fields = format_csv.split_csv_line(header)
line_fields = format_csv.split_csv_line(line)
return get_tweet_from_csv_line(header_fields, line_fields)
def get_tweet_from_csv_line(header_fields, line_fields):
"""
Given the fields of a CSV line and header, the function instances a Tweet
object with all the non-empty attributes initialized to the values
indicated in the CSV entry.
Returns a Tweet object
"""
tweet_contents = {}
for i in range(len(line_fields)):
if line_fields[i] != '':
tweet_contents[header_fields[i].replace(".", "__")] =\
line_fields[i]
# try: # Setter methods
# tweet = Tweet(**tweet_contents) def set_polarity(self, polarity):
# except Exception as e: self.polarity = polarity
# print("An error of type " + type(e).__str__ + "ocurred")
# raise Exception
#
# return tweet
return Tweet(**tweet_contents)
def set_trtext(self, trtext):
self.trtext = trtext
def get_tweets_from_csv(csv_file): def __getitem__(self, key):
""" return getattr(self, key)
Take one argument: a path pointing to a valid CSV file.
The function reads the file, which should be a collection of tweets with a
header indicating the tweet fields (user.id, place.bounding_box.type,
etc.), and instances a new Tweet object for each of the lines in the CSV
file, assigning each value in the CSV to the corresponding Tweet attribute.
Returns a list of the Tweet objects instanced.
"""
tweets = []
with open(csv_file, 'r') as csv_object:
header = csv_object.readline()
body = csv_object.readlines()
header_fields = format_csv.split_csv_line(header)
# Check that the header contains valid fields
test_tweet = Tweet()
for field in header_fields:
field_components = field.split(".")
checking_dict = test_tweet.__dict__
error_string = ""
for component in field_components:
error_string += component
if (checking_dict is None) or (component not in checking_dict):
logger.error('The field in the header ' + error_string +
'is not a valid element of a Tweet')
raise NotValidTweetError("Header contains field which doesn't"
+ " belong to tweet specification: "
+ error_string)
checking_dict = checking_dict[component]
error_string += "."
# Go through every tweet in the file, instance it using the 'Tweet' class
# and add it to the list 'tweets'
for j in range(len(body)):
line_fields = format_csv.split_csv_line(body[j])
tweets.append(get_tweet_from_csv_line(header_fields, line_fields))
return tweets
def get_tweet_fields_subset(
tweet: Tweet,
fields: List[str]
) -> Dict:
"""
Given a Tweet objects, keep just the specified fields and return a dict
with just the information specified
"""
tweet_subset = {}
for field in fields:
try:
tweet_subset[field] = tweet[field]
except AttributeError:
pass
return tweet_subset
def get_tweet_collection_fields_subset(
tweet_collection: Union[List[Tweet], Generator[Tweet, None, None]],
fields: List[str]
) -> Generator[Dict, None, None]:
"""
Given a list of Tweet objects, keep just the specified fields and
return a generator of dicts with just the information specified
"""
for tweet in tweet_collection:
yield get_tweet_fields_subset(tweet, fields)
import logging
from typing import Union, Dict, List, Generator
from tweet import Tweet
from tweet_manager.lib import format_csv
# Configure logger
# LOG_FORMAT renders each record as "[timestamp] LEVEL: message".
# NOTE(review): logging.basicConfig runs at import time and sets up the root
# logger at DEBUG level, so importing this module affects logging for the
# whole process -- confirm this is intended for a library module.
LOG_FORMAT = '[%(asctime)-15s] %(levelname)s: %(message)s'
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
# Module-wide logger; it is registered under the literal name "logger".
logger = logging.getLogger("logger")
def get_tweet_from_csv_raw_line(header, line):
    """
    Build a Tweet from a raw CSV header string and a raw CSV line string.

    Both strings are split into their fields with
    format_csv.split_csv_line and the resulting field lists are handed to
    get_tweet_from_csv_line, which does the actual instantiation.

    Returns a Tweet object
    """
    return get_tweet_from_csv_line(
        format_csv.split_csv_line(header),
        format_csv.split_csv_line(line),
    )
def get_tweet_from_csv_line(header_fields, line_fields):
    """
    Build a Tweet from the already-split fields of a CSV header and line.

    Every non-empty value in line_fields becomes a keyword argument of the
    Tweet constructor. The keyword name is the header field found at the
    same position, with each "." replaced by "__" (for example "user.id"
    becomes "user__id"). Empty values are left out, so the corresponding
    Tweet arguments keep their defaults.

    A non-empty value located beyond the end of header_fields raises
    IndexError, because the header is looked up by position.

    Returns a Tweet object
    """
    constructor_kwargs = {
        header_fields[position].replace(".", "__"): value
        for position, value in enumerate(line_fields)
        if value != ''
    }
    return Tweet(**constructor_kwargs)
def get_tweets_from_csv(csv_file):
    """
    Read a CSV file of tweets and instantiate one Tweet per data line.

    Takes one argument: a path pointing to a valid CSV file. The first line
    of the file must be a header naming the tweet fields in dotted notation
    (user.id, place.bounding_box.type, etc.); every following line is one
    tweet whose values are assigned to the corresponding Tweet attributes.

    The file is opened in text mode without an explicit encoding, so the
    platform default encoding is used.

    Raises NotValidTweetError if the header names a field that cannot be
    found in a default-constructed Tweet.

    Returns a list of the Tweet objects instanced.
    """
    with open(csv_file, 'r') as csv_object:
        header = csv_object.readline()
        body = csv_object.readlines()

    header_fields = format_csv.split_csv_line(header)

    # Check that the header contains valid fields: walk each dotted name
    # through the attributes (and nested dicts) of an empty Tweet.
    test_tweet = Tweet()
    for field in header_fields:
        checking_dict = test_tweet.__dict__
        # Dotted path walked so far; used only for the error report.
        error_string = ""
        for component in field.split("."):
            error_string += component
            if (checking_dict is None) or (component not in checking_dict):
                # The space before "is" was missing in the original
                # message, which glued the field name to the next word.
                logger.error(
                    'The field in the header %s is not a valid element '
                    'of a Tweet', error_string)
                raise NotValidTweetError("Header contains field which doesn't"
                                         + " belong to tweet specification: "
                                         + error_string)
            checking_dict = checking_dict[component]
            error_string += "."

    # Go through every tweet in the file and instance it using the 'Tweet'
    # class (via get_tweet_from_csv_line).
    return [
        get_tweet_from_csv_line(header_fields,
                                format_csv.split_csv_line(raw_line))
        for raw_line in body
    ]
def get_tweet_fields_subset(
        tweet: Tweet,
        fields: List[str]
        ) -> Dict:
    """
    Return a dict holding only the requested fields of a Tweet.

    Each name in fields is looked up with tweet[name]. Names whose lookup
    raises AttributeError are skipped, so the result may contain fewer keys
    than were requested. Keys appear in the order given in fields.
    """
    selected = {}
    for name in fields:
        try:
            value = tweet[name]
        except AttributeError:
            # Field not present on this tweet: leave it out of the result.
            continue
        selected[name] = value
    return selected
def get_tweet_collection_fields_subset(
        tweet_collection: Union[List[Tweet], Generator[Tweet, None, None]],
        fields: List[str]
        ) -> Generator[Dict, None, None]:
    """
    Lazily reduce every Tweet of a collection to the requested fields.

    Iterates over tweet_collection (a list or a generator of Tweet objects)
    and yields, for each tweet, the dict produced by
    get_tweet_fields_subset(tweet, fields). Nothing is computed until the
    returned generator is consumed.
    """
    yield from (
        get_tweet_fields_subset(single_tweet, fields)
        for single_tweet in tweet_collection
    )
class NotValidTweetError(Exception):
    """Raised when a CSV header names a field that is not part of a Tweet."""
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment