Commit 064f7030 by Serbaf

Version 5.0

* Code from tweet_model.py is now divided into two source files: tweet.py (containing the Tweet class) and utils.py (which contains the rest of the utilities to instantiate Tweet objects from CSVs, return a mini-dict with partial information of a Tweet, etc.) * Added two new fields to the Tweet class: "polarity" and "trtext", which are not part of the original Twitter tweets, but are needed in another project. * Added setter methods for "polarity" and "trtext", which will probably be set not at instantiation time but afterwards
parent b5cc4444
......@@ -66,3 +66,15 @@ contains certain symbols
Version 0.4.7:
Added function to generate dicts representing subsets of Tweet content (return
just the fields indicated by the user and not the full Tweet object)
0.5.0 (2019-05-03)
------------------
* Code from tweet_model.py now divided in two source files: tweet.py
(containing the Tweet class) and utils.py (which contains the rest of the
utilities to instantiate Tweet objects from CSVs, return a mini-dict with
partial information of a Tweet, etc.)
* Added two new fields to the Tweet class: "polarity" and "trtext", which are
not part of the original Twitter tweets, but are needed in another project.
* Added setter methods for "polarity" and "trtext", which will probably be set
not at instantiation time but afterwards
......@@ -51,6 +51,6 @@ setup(
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/Serbaf/tweet_model',
version='0.4.7',
version='0.5.0',
zip_safe=False,
)
# -*- coding: utf-8 -*-
"""Main module."""
import logging
from typing import Union, Dict, List, Generator
from tweet_manager.lib import format_csv
# Logging setup: the root logger is configured at import time (DEBUG level,
# timestamped format) and a module-level logger named "logger" is exposed.
LOG_FORMAT = '[%(asctime)-15s] %(levelname)s: %(message)s'
logging.basicConfig(
    format=LOG_FORMAT,
    level=logging.DEBUG,
)
logger = logging.getLogger("logger")
class Tweet():
"""
......@@ -138,7 +128,12 @@ class Tweet():
extended_entities__media__source_status_id=None,
extended_entities__media__source_status_id_str=None,
extended_entities__media__type=None,
extended_entities__media__url=None):
extended_entities__media__url=None,
# Additional fields (not from the Tweeter model)
polarity=None,
trtext=None
):
# Basic attributes
self.created_at = created_at
......@@ -360,121 +355,16 @@ class Tweet():
extended_entities__media__type
self.extended_entities["media"]["url"] = extended_entities__media__url
def __getitem__(self, key):
return getattr(self, key)
class NotValidTweetError(Exception):
    """Raised when a CSV header names a field that a Tweet does not have."""
def get_tweet_from_csv_raw_line(header, line):
    """
    Build a Tweet from a raw CSV header and a raw CSV line.

    Both arguments are raw strings; each one is split into its fields with
    format_csv.split_csv_line and the two resulting field lists are handed
    to get_tweet_from_csv_line.

    Returns a Tweet object
    """
    return get_tweet_from_csv_line(
        format_csv.split_csv_line(header),
        format_csv.split_csv_line(line),
    )
def get_tweet_from_csv_line(header_fields, line_fields):
"""
Given the fields of a CSV line and header, the function instances a Tweet
object with all the non-empty attributes initialized to the values
indicated in the CSV entry.
Returns a Tweet object
"""
tweet_contents = {}
for i in range(len(line_fields)):
if line_fields[i] != '':
tweet_contents[header_fields[i].replace(".", "__")] =\
line_fields[i]
# Additional fields
self.polarity = polarity
self.trtext = trtext
# try:
# tweet = Tweet(**tweet_contents)
# except Exception as e:
# print("An error of type " + type(e).__str__ + "ocurred")
# raise Exception
#
# return tweet
return Tweet(**tweet_contents)
# Setter methods
def set_polarity(self, polarity):
    """Store ``polarity`` on this instance, replacing any previous value."""
    self.polarity = polarity
def set_trtext(self, trtext):
    """Store ``trtext`` on this instance, replacing any previous value."""
    self.trtext = trtext
def _check_header_fields(header_fields):
    """
    Verify that every dotted header field resolves inside a Tweet.

    Each field (e.g. "user.id") is walked component by component through the
    attribute dict of a default-constructed Tweet. Raises NotValidTweetError
    at the first component that cannot be resolved.
    """
    # A Tweet built with no arguments serves as the reference structure.
    reference = Tweet().__dict__
    for field in header_fields:
        components = field.split(".")
        current = reference
        for depth, component in enumerate(components):
            if current is None or component not in current:
                # Report the dotted path up to and including the bad part.
                bad_path = ".".join(components[:depth + 1])
                logger.error(
                    'The field in the header %s is not a valid element '
                    'of a Tweet', bad_path)
                raise NotValidTweetError(
                    "Header contains field which doesn't belong to tweet "
                    "specification: " + bad_path)
            current = current[component]


def get_tweets_from_csv(csv_file):
    """
    Read a CSV file of tweets and instance one Tweet per data line.

    Takes one argument: a path pointing to a CSV file whose first line is a
    header naming the tweet fields (user.id, place.bounding_box.type, etc.).
    The header is validated before any line is parsed; every remaining line
    is then turned into a Tweet through get_tweet_from_csv_line.

    Raises NotValidTweetError if the header contains a field that is not
    part of a Tweet.

    Returns a list of the Tweet objects instanced.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm whether UTF-8 should be forced for tweet text.
    with open(csv_file, 'r') as csv_object:
        header = csv_object.readline()
        body = csv_object.readlines()

    header_fields = format_csv.split_csv_line(header)
    _check_header_fields(header_fields)

    return [
        get_tweet_from_csv_line(header_fields,
                                format_csv.split_csv_line(line))
        for line in body
    ]
def get_tweet_fields_subset(
        tweet: Tweet,
        fields: List[str]
) -> Dict:
    """
    Project a Tweet onto the requested fields.

    Each name in ``fields`` is looked up with ``tweet[name]``; names whose
    lookup raises AttributeError are silently left out of the result.

    Returns a dict mapping every resolvable field name to its value
    """
    subset = {}
    for name in fields:
        try:
            value = tweet[name]
        except AttributeError:
            # Unknown field: skip it rather than failing the whole subset.
            continue
        subset[name] = value
    return subset
def get_tweet_collection_fields_subset(
        tweet_collection: Union[List[Tweet], Generator[Tweet, None, None]],
        fields: List[str]
) -> Generator[Dict, None, None]:
    """
    Lazily project every Tweet of a collection onto the requested fields.

    Each element of ``tweet_collection`` is passed, together with ``fields``,
    to get_tweet_fields_subset; the resulting dicts are yielded one by one.
    """
    yield from (
        get_tweet_fields_subset(item, fields) for item in tweet_collection
    )
def __getitem__(self, key):
return getattr(self, key)
import logging
from typing import Union, Dict, List, Generator
from tweet import Tweet
from tweet_manager.lib import format_csv
# Logging setup: the root logger is configured at import time (DEBUG level,
# timestamped format) and a module-level logger named "logger" is exposed.
LOG_FORMAT = '[%(asctime)-15s] %(levelname)s: %(message)s'
logging.basicConfig(
    format=LOG_FORMAT,
    level=logging.DEBUG,
)
logger = logging.getLogger("logger")
def get_tweet_from_csv_raw_line(header, line):
    """
    Build a Tweet from a raw CSV header and a raw CSV line.

    Both arguments are raw strings; each one is split into its fields with
    format_csv.split_csv_line and the two resulting field lists are handed
    to get_tweet_from_csv_line.

    Returns a Tweet object
    """
    return get_tweet_from_csv_line(
        format_csv.split_csv_line(header),
        format_csv.split_csv_line(line),
    )
def get_tweet_from_csv_line(header_fields, line_fields):
    """
    Instance a Tweet from the already-split fields of a CSV header and line.

    Every non-empty value in ``line_fields`` is paired with the header name
    at the same position and passed to the Tweet constructor as a keyword
    argument; empty values are left out.

    Returns a Tweet object
    """
    # Dots in header names become double underscores so that the names can
    # be used as keyword arguments (e.g. "user.id" -> "user__id").
    tweet_contents = {
        header_fields[position].replace(".", "__"): value
        for position, value in enumerate(line_fields)
        if value != ''
    }
    return Tweet(**tweet_contents)
def _check_header_fields(header_fields):
    """
    Verify that every dotted header field resolves inside a Tweet.

    Each field (e.g. "user.id") is walked component by component through the
    attribute dict of a default-constructed Tweet. Raises NotValidTweetError
    at the first component that cannot be resolved.
    """
    # A Tweet built with no arguments serves as the reference structure.
    reference = Tweet().__dict__
    for field in header_fields:
        components = field.split(".")
        current = reference
        for depth, component in enumerate(components):
            if current is None or component not in current:
                # Report the dotted path up to and including the bad part.
                bad_path = ".".join(components[:depth + 1])
                logger.error(
                    'The field in the header %s is not a valid element '
                    'of a Tweet', bad_path)
                raise NotValidTweetError(
                    "Header contains field which doesn't belong to tweet "
                    "specification: " + bad_path)
            current = current[component]


def get_tweets_from_csv(csv_file):
    """
    Read a CSV file of tweets and instance one Tweet per data line.

    Takes one argument: a path pointing to a CSV file whose first line is a
    header naming the tweet fields (user.id, place.bounding_box.type, etc.).
    The header is validated before any line is parsed; every remaining line
    is then turned into a Tweet through get_tweet_from_csv_line.

    Raises NotValidTweetError if the header contains a field that is not
    part of a Tweet.

    Returns a list of the Tweet objects instanced.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm whether UTF-8 should be forced for tweet text.
    with open(csv_file, 'r') as csv_object:
        header = csv_object.readline()
        body = csv_object.readlines()

    header_fields = format_csv.split_csv_line(header)
    _check_header_fields(header_fields)

    return [
        get_tweet_from_csv_line(header_fields,
                                format_csv.split_csv_line(line))
        for line in body
    ]
def get_tweet_fields_subset(
        tweet: Tweet,
        fields: List[str]
) -> Dict:
    """
    Project a Tweet onto the requested fields.

    Each name in ``fields`` is looked up with ``tweet[name]``; names whose
    lookup raises AttributeError are silently left out of the result.

    Returns a dict mapping every resolvable field name to its value
    """
    subset = {}
    for name in fields:
        try:
            value = tweet[name]
        except AttributeError:
            # Unknown field: skip it rather than failing the whole subset.
            continue
        subset[name] = value
    return subset
def get_tweet_collection_fields_subset(
        tweet_collection: Union[List[Tweet], Generator[Tweet, None, None]],
        fields: List[str]
) -> Generator[Dict, None, None]:
    """
    Lazily project every Tweet of a collection onto the requested fields.

    Each element of ``tweet_collection`` is passed, together with ``fields``,
    to get_tweet_fields_subset; the resulting dicts are yielded one by one.
    """
    yield from (
        get_tweet_fields_subset(item, fields) for item in tweet_collection
    )
class NotValidTweetError(Exception):
    """Raised when a CSV header names a field that a Tweet does not have."""
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment