#Import the necessary libraries needed for the project.

# Data analysis libraries
import pandas as pd
import numpy as np

# Visualizaton tools 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Networking libraries
import requests
import tweepy
from tweepy import OAuthHandler
from timeit import default_timer as timer

# For parsing json
import json


# Since our data will contain long strings, make sure we display entire strings of dataframe
pd.set_option('display.max_colwidth', -1)

<ipython-input-2-0481607fbd9a>:2: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.
  pd.set_option('display.max_colwidth', -1)


# Create dataframe of WeRateDogs twitter archive and view the first few records.
df_twitter_archive = pd.read_csv('twitter_archive_enhanced.csv')
df_twitter_archive.head()


# Set the url to the file's location
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# Request the url and save the response
response = requests.get(url)

# If there is not an error, write the file to our directory
if response.status_code == 200:
    with open('image_predictions.tsv', 'wb') as file:
        file.write(response.content)


# Load the predictions into a Pandas dataframe.
df_image_predictions =  pd.read_csv('image_predictions.tsv', sep='\t')
df_image_predictions.head()


'''
# Keys, secrets, etc. are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor

# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = df_twitter_archive.tweet_id.values
len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)
'''

'\n# Keys, secrets, etc. are hidden to comply with Twitter\'s API terms and conditions\nconsumer_key = \'HIDDEN\'\nconsumer_secret = \'HIDDEN\'\naccess_token = \'HIDDEN\'\naccess_secret = \'HIDDEN\'\n\nauth = OAuthHandler(consumer_key, consumer_secret)\nauth.set_access_token(access_token, access_secret)\n\napi = tweepy.API(auth, wait_on_rate_limit=True)\n\n# NOTE TO REVIEWER: this student had mobile verification issues so the following\n# Twitter API code was sent to this student from a Udacity instructor\n\n# Tweet IDs for which to gather additional data via Twitter\'s API\ntweet_ids = df_twitter_archive.tweet_id.values\nlen(tweet_ids)\n\n# Query Twitter\'s API for JSON data for each tweet ID in the Twitter archive\ncount = 0\nfails_dict = {}\nstart = timer()\n# Save each tweet\'s returned JSON as a new line in a .txt file\nwith open(\'tweet_json.txt\', \'w\') as outfile:\n    # This loop will likely take 20-30 minutes to run because of Twitter\'s rate limit\n    for tweet_id in tweet_ids:\n        count += 1\n        print(str(count) + ": " + str(tweet_id))\n        try:\n            tweet = api.get_status(tweet_id, tweet_mode=\'extended\')\n            print("Success")\n            json.dump(tweet._json, outfile)\n            outfile.write(\'\n\')\n        except tweepy.TweepError as e:\n            print("Fail")\n            fails_dict[tweet_id] = e\n            pass\nend = timer()\nprint(end - start)\nprint(fails_dict)\n'


# Create our empty pandas dataframe (with column names only)
df_twitter_api_data = pd.DataFrame(columns = ['tweet_id', 'retweet_count', 'favorite_count'])

with open('tweet_json.txt', 'r') as file:

    # Read first line of tweet data
    tweet = file.readline()

    # Loop through successive tweets until no more left
    while (tweet):
        
        # Load json tweet data
        tweet_data = json.loads(tweet)

        # Extract and append the tweet data to our dataframe
        df_twitter_api_data = df_twitter_api_data.append({'tweet_id': tweet_data['id'], 
                          'retweet_count': tweet_data['retweet_count'], 
                          'favorite_count': tweet_data['favorite_count']
                         }, ignore_index=True)

        # Read in next tweet before going through the loop again
        tweet = file.readline()

# Print out the first few records of our dataframe
df_twitter_api_data.head()


df_twitter_archive


df_twitter_archive['source'].value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                        91  
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                     33  
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>    11  
Name: source, dtype: int64


df_twitter_archive['name'].value_counts()

None       745
a          55 
Charlie    12 
Lucy       11 
Cooper     11 
           .. 
Harlso     1  
Tedrick    1  
Jareld     1  
Fynn       1  
Laika      1  
Name: name, Length: 957, dtype: int64


df_twitter_archive[df_twitter_archive['name'].str.get(0).str.islower()]['name'].value_counts()

a               55
the             8 
an              7 
very            5 
quite           4 
just            4 
one             4 
mad             2 
actually        2 
getting         2 
not             2 
by              1 
unacceptable    1 
light           1 
all             1 
my              1 
old             1 
his             1 
such            1 
life            1 
space           1 
infuriating     1 
officially      1 
incredibly      1 
this            1 
Name: name, dtype: int64


df_twitter_archive[(df_twitter_archive['name'] == 'A') | (df_twitter_archive['name'] == 'The') 
                   | (df_twitter_archive['name'] == 'An')]['name'].count()

0


df_twitter_archive[df_twitter_archive['name'].str.get(0).str.isupper()]['name'].value_counts()

None       745
Charlie    12 
Lucy       11 
Cooper     11 
Oliver     11 
           .. 
Harlso     1  
Tedrick    1  
Jareld     1  
Fynn       1  
Laika      1  
Name: name, Length: 932, dtype: int64


df_twitter_archive[df_twitter_archive['rating_denominator'] != 10][['text', 'rating_numerator', 'rating_denominator']]


# Let's create a temporary dataframe that contains only rows that have at least one non-"None" value
check_dup_stages = df_twitter_archive[(df_twitter_archive['doggo']!='None') |
                  (df_twitter_archive['floofer']!='None') |
                  (df_twitter_archive['pupper']!='None') |
                   (df_twitter_archive['puppo']!='None')][['tweet_id', 'doggo', 'floofer', 'pupper', 'puppo']]


# Next, in order to be able to sum the number of non-None values per row, let's map
# them to numeric values. 0 for "None" and 1 for each of the column names
check_dup_stages.doggo = check_dup_stages.doggo.map({'None':0, 'doggo':1})
check_dup_stages.floofer = check_dup_stages.floofer.map({'None':0, 'floofer':1})
check_dup_stages.pupper = check_dup_stages.pupper.map({'None':0, 'pupper':1})
check_dup_stages.puppo = check_dup_stages.puppo.map({'None':0, 'puppo':1})


# Let's sum up our rows
check_dup_stages['stages_sum'] = check_dup_stages.doggo + check_dup_stages.floofer \
                                + check_dup_stages.pupper + check_dup_stages.puppo


check_dup_stages['stages_sum'].value_counts()

1    366
2    14 
Name: stages_sum, dtype: int64


df_twitter_archive['doggo'].value_counts()

None     2259
doggo    97  
Name: doggo, dtype: int64


df_twitter_archive['floofer'].value_counts()

None       2346
floofer    10  
Name: floofer, dtype: int64


df_twitter_archive['pupper'].value_counts()

None      2099
pupper    257 
Name: pupper, dtype: int64


df_twitter_archive['puppo'].value_counts()

None     2326
puppo    30  
Name: puppo, dtype: int64


df_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 non-null   object 
 14  floofer                     2356 non-null   object 
 15  pupper                      2356 non-null   object 
 16  puppo                       2356 non-null   object 
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB


df_twitter_archive[df_twitter_archive['expanded_urls'].isna()]


df_twitter_archive[(df_twitter_archive['tweet_id'] == 828361771580813312) 
                   | (df_twitter_archive['tweet_id'] == 785515384317313025) 
                   | (df_twitter_archive['tweet_id'] == 696518437233913856)]


print('Duplicates: {}'.format(sum(df_twitter_archive.duplicated())))

Duplicates: 0


df_image_predictions


sum(df_image_predictions['tweet_id'].duplicated())

0


df_image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


print('Rows in df_twitter_archive: {}'.format(df_twitter_archive.shape[0]))
print('Rows in df_image_predictions: {}'.format(df_image_predictions.shape[0]))

Rows in df_twitter_archive: 2356
Rows in df_image_predictions: 2075


print('Replies to remove: {}'.format(df_twitter_archive[~df_twitter_archive['in_reply_to_status_id'].isna()].tweet_id.count()))
print('Retweets to remove: {}'.format(df_twitter_archive[~df_twitter_archive['retweeted_status_id'].isna()].tweet_id.count()))

Replies to remove: 78
Retweets to remove: 181


df_twitter_api_data


df_twitter_api_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        2354 non-null   object
 1   retweet_count   2354 non-null   object
 2   favorite_count  2354 non-null   object
dtypes: object(3)
memory usage: 55.3+ KB


print('Duplicates: {}'.format(sum(df_twitter_api_data.duplicated())))

Duplicates: 0


df_archive_clean = df_twitter_archive.copy()
df_image_predictions_clean = df_image_predictions.copy()
df_api_data_clean = df_twitter_api_data.copy()


# Remove all replies
df_archive_clean = df_archive_clean[df_archive_clean['in_reply_to_status_id'].isnull()]


# Remove all retweets
df_archive_clean = df_archive_clean[df_archive_clean['retweeted_status_id'].isnull()]


# Verify that there are no non-null values in the 'in_reply_to_status_id' or 'in_reply_to_user_id' columns
# As these are the two columns associated with replies, we should see '0' returned
df_archive_clean[(~df_archive_clean['in_reply_to_status_id'].isnull()) | (~df_archive_clean['in_reply_to_user_id'].isnull())]['tweet_id'].count()

0


# Verify that there are no non-null values in the in 'retweeted_status_id', 'retweeted_status_user_id', 
# or 'retweeted_status_timestamp' columns
# As these are the two columns associated with retweets, we should see '0' returned
df_archive_clean[(~df_archive_clean['retweeted_status_id'].isnull()) | (~df_archive_clean['retweeted_status_user_id'].isnull()) | (~df_archive_clean['retweeted_status_timestamp'].isnull())]['tweet_id'].count()

0


# Remove rows with null in 'expanded_urls' column
df_archive_clean = df_archive_clean[~df_archive_clean['expanded_urls'].isnull()]


# Verify that the number of rows with null in 'expanded_urls' column is 0
df_archive_clean[df_archive_clean['expanded_urls'].isnull()]['tweet_id'].count()

0


# Let's do a quick count on the number of remaining rows in the archive. It should be 2094, as calculated in the 
# Assess section
df_archive_clean.shape[0]

2094


# Drop the columns and save the result
df_archive_clean = df_archive_clean.drop(['in_reply_to_status_id', 
                                          'in_reply_to_user_id', 
                                          'retweeted_status_id', 
                                          'retweeted_status_user_id', 
                                          'retweeted_status_timestamp'], axis=1)


# Verify that the above columns are not present in the list of returned columns
df_archive_clean.columns

Index(['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
       'rating_numerator', 'rating_denominator', 'name', 'doggo', 'floofer',
       'pupper', 'puppo'],
      dtype='object')


# Get count of columns with at least one dog stage
df_archive_clean[(df_archive_clean['doggo']!='None') |
                  (df_archive_clean['floofer']!='None') |
                  (df_archive_clean['pupper']!='None') |
                   (df_archive_clean['puppo']!='None')].tweet_id.count()

335


# Columns used for id_vars parameter in the melt will be all but the last 4 in our archive
id_cols = df_archive_clean.columns[:-4].values.tolist()
id_cols

['tweet_id',
 'timestamp',
 'source',
 'text',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name']


# Columns used in the value_vars parameter in melt will be the last 4 in our archive 
# (i.e. doggo, floofer, pupper, and puppo)
value_cols = df_archive_clean.columns[8:].values.tolist()
value_cols

['doggo', 'floofer', 'pupper', 'puppo']


# Perform the melt. This will create a row for each tweet for each of the columns specified in the value_vars
# The "value" of that original column will be placed in the "dog_stage" column (specified in the value_name paramenter)
df_archive_clean = pd.melt(df_archive_clean, id_vars = id_cols, value_vars = value_cols, value_name='dog_stage')


# We won't need the variable column, so let's drop it
df_archive_clean = df_archive_clean.drop(['variable'], axis=1)


# Sort on the tweet_id and dog_stage column
df_archive_clean = df_archive_clean.sort_values(by=['dog_stage'])


# Let's drop the unnecessary rows for each tweet id (our subset), use keep='last' to make sure
# that any non-None values are chosen.
df_archive_clean = df_archive_clean.drop_duplicates(subset='tweet_id', keep='last')


# Verify that we haven't lost any rows. This should return 2094.
df_archive_clean.shape[0]

2094


# Verify that we have the same number of rows that contain a non-None dog_stage value that we did above,
# i.e. this should return 335.
df_archive_clean[df_archive_clean['dog_stage']!='None'].tweet_id.count()

335


# Create a set of the tweet ids in our archive
archive_tweet_ids = set(df_archive_clean['tweet_id'].tolist())

# Create a set of the tweet ids in our image predictions
prediction_tweet_ids = set(df_image_predictions_clean['tweet_id'].tolist())

# Get the set of ids that are in the archive that are not in the image predictions file
missing_tweet_ids = archive_tweet_ids.difference(prediction_tweet_ids)

# Print the total number of id's that will be present in our merged sets
len(archive_tweet_ids) - len(missing_tweet_ids)

1971


# Merge the two dataframes
df_archive_clean = df_archive_clean.merge(df_image_predictions_clean, how='inner', on='tweet_id')


# Verify visually that the newly merged data is present
# Specifically that the jpg_url, img_num, p1, p1_conf, p1_dog, p2, p2_conf, p2_dog, p3, p3_conf, and p3_dog columns are present
df_archive_clean.head()


# The total number of rows in the archive should now match what we calculated above
df_archive_clean.shape[0]

1971


# Convert the tweet_id column of the api data to an integer
df_api_data_clean['tweet_id'] = df_api_data_clean['tweet_id'].astype(int)


# Create a set of the tweet ids in our archive
archive_tweet_ids = set(df_archive_clean['tweet_id'].tolist())

# Create a set of the tweet ids in our api data
api_tweet_ids = set(df_api_data_clean['tweet_id'].tolist())

# Get the set of ids that are in the archive that are not in the image predictions file
missing_tweet_ids = archive_tweet_ids.difference(api_tweet_ids)

# Print the total number of id's that will be present in our merged sets
len(archive_tweet_ids) - len(missing_tweet_ids)

1971


# Merge the two dataframes
df_archive_clean = df_archive_clean.merge(df_api_data_clean, how='inner', on='tweet_id')


# Verify that the newly merged data is present, specifically the retweet_count and favorite_count columns
df_archive_clean.head()


# The total number of rows in the archive should be equal to the count tallied above
df_archive_clean.shape[0]

1971


# Convert to datetime
df_archive_clean['timestamp'] = pd.to_datetime(df_archive_clean['timestamp'])


# Verify that timestamp has datetime data type
df_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1971 entries, 0 to 1970
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            1971 non-null   int64              
 1   timestamp           1971 non-null   datetime64[ns, UTC]
 2   source              1971 non-null   object             
 3   text                1971 non-null   object             
 4   expanded_urls       1971 non-null   object             
 5   rating_numerator    1971 non-null   int64              
 6   rating_denominator  1971 non-null   int64              
 7   name                1971 non-null   object             
 8   dog_stage           1971 non-null   object             
 9   jpg_url             1971 non-null   object             
 10  img_num             1971 non-null   int64              
 11  p1                  1971 non-null   object             
 12  p1_conf             1971 non-null   float64            
 13  p1_dog              1971 non-null   bool               
 14  p2                  1971 non-null   object             
 15  p2_conf             1971 non-null   float64            
 16  p2_dog              1971 non-null   bool               
 17  p3                  1971 non-null   object             
 18  p3_conf             1971 non-null   float64            
 19  p3_dog              1971 non-null   bool               
 20  retweet_count       1971 non-null   object             
 21  favorite_count      1971 non-null   object             
dtypes: bool(3), datetime64[ns, UTC](1), float64(3), int64(4), object(11)
memory usage: 313.7+ KB


# Convert datatypes to int
df_archive_clean['retweet_count'] = df_archive_clean['retweet_count'].astype(int)
df_archive_clean['favorite_count'] = df_archive_clean['favorite_count'].astype(int)


# Verify that both columns are now of data type integer
df_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1971 entries, 0 to 1970
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            1971 non-null   int64              
 1   timestamp           1971 non-null   datetime64[ns, UTC]
 2   source              1971 non-null   object             
 3   text                1971 non-null   object             
 4   expanded_urls       1971 non-null   object             
 5   rating_numerator    1971 non-null   int64              
 6   rating_denominator  1971 non-null   int64              
 7   name                1971 non-null   object             
 8   dog_stage           1971 non-null   object             
 9   jpg_url             1971 non-null   object             
 10  img_num             1971 non-null   int64              
 11  p1                  1971 non-null   object             
 12  p1_conf             1971 non-null   float64            
 13  p1_dog              1971 non-null   bool               
 14  p2                  1971 non-null   object             
 15  p2_conf             1971 non-null   float64            
 16  p2_dog              1971 non-null   bool               
 17  p3                  1971 non-null   object             
 18  p3_conf             1971 non-null   float64            
 19  p3_dog              1971 non-null   bool               
 20  retweet_count       1971 non-null   int64              
 21  favorite_count      1971 non-null   int64              
dtypes: bool(3), datetime64[ns, UTC](1), float64(3), int64(6), object(9)
memory usage: 313.7+ KB


# Clean the source column by mapping links to appropriate values
df_archive_clean['source'] = df_archive_clean['source'].map({'<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>':'Twitter for iPhone', 
                                                             '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>':'Vine',
                                                            '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>':'Twitter Web Client',
                                                            '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>':'TweetDeck'})


# Verify that the long hyperlinks have been replaced with the app names
df_archive_clean['source'].value_counts()

Twitter for iPhone    1932
Twitter Web Client    28  
TweetDeck             11  
Name: source, dtype: int64


# Get value counts in name column to demonstrate that "None" and lowercase words are present
df_archive_clean['name'].value_counts()

None       524
a          55 
Charlie    11 
Oliver     10 
Cooper     10 
           .. 
Birf       1  
Davey      1  
Adele      1  
Cheryl     1  
Laika      1  
Name: name, Length: 935, dtype: int64


# Get a list of all of the lower case words
names_to_clean = df_archive_clean[df_archive_clean['name'].str.get(0).str.islower()]['name'].value_counts().index.tolist()

# Append "None" to the list
names_to_clean.append('None')

# Replace all occurances of the words in the list to np.nan
df_archive_clean['name'].replace(names_to_clean, np.nan, inplace = True)


# Get value counts in name column to demonstrate that "None" and lowercase words are no longer present
df_archive_clean['name'].value_counts()

Charlie      11
Cooper       10
Lucy         10
Oliver       10
Tucker       9 
            .. 
Odin         1 
Pupcasso     1 
Kathmandu    1 
Tedrick      1 
Laika        1 
Name: name, Length: 912, dtype: int64


# Loop through the list of names we deemed for cleaning and see if it is found in the archive
# Should return false for each name in the list
for name in names_to_clean:
    print(name in df_archive_clean['name'])

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


# Get a list of the tweets that have a rating with a denominator higher than 10.
df_archive_clean[df_archive_clean['rating_denominator'] != 10][['text', 'rating_numerator', 'rating_denominator']]


# Remove the tweet at index 1331
df_archive_clean = df_archive_clean.drop(df_archive_clean.index[1331])


# Correct the rating of the tweet with index 100
df_archive_clean.at[100, 'rating_numerator'] = 9
df_archive_clean.at[100, 'rating_denominator'] = 10


# Correct the rating of the tweet with index 314
df_archive_clean.at[314, 'rating_numerator'] = 10
df_archive_clean.at[314, 'rating_denominator'] = 10


# Correct the rating of the tweet with index 455
df_archive_clean.at[455, 'rating_numerator'] = 13
df_archive_clean.at[455, 'rating_denominator'] = 10


# Correct the rating of the tweet with index 476
df_archive_clean.at[476, 'rating_numerator'] = 11
df_archive_clean.at[476, 'rating_denominator'] = 10


# Correct the rating of the tweet with index 1146
df_archive_clean.at[1146, 'rating_numerator'] = 14
df_archive_clean.at[1146, 'rating_denominator'] = 10


# Verify that the tweet with index 1331 is no longer in our archive
# By checking that it's not present between the range of indexes that precede and follow it
df_archive_clean.iloc[1330:1332]


# Verify that the tweet at index 100 has the correct rating
# Should be 9/10
print(df_archive_clean.iloc[100]['rating_numerator'], df_archive_clean.iloc[100]['rating_denominator'], sep='/')

9/10


# Verify that the tweet at index 314 has the correct rating
# Should be 10/10
print(df_archive_clean.iloc[314]['rating_numerator'], df_archive_clean.iloc[314]['rating_denominator'], sep='/')

10/10


# Verify that the tweet at index 455 has the correct rating
# Should be 13/10
print(df_archive_clean.iloc[455]['rating_numerator'], df_archive_clean.iloc[455]['rating_denominator'], sep='/')

13/10


# Verify that the tweet at index 476 has the correct rating
# Should be 11/10
print(df_archive_clean.iloc[476]['rating_numerator'], df_archive_clean.iloc[476]['rating_denominator'], sep='/')

11/10


# Verify that the tweet at index 1146 has the correct rating
# Should be 14/10
print(df_archive_clean.iloc[1146]['rating_numerator'], df_archive_clean.iloc[1146]['rating_denominator'], sep='/')

14/10


# Remove underscores from columns
df_archive_clean['p1'].replace('_', ' ', regex=True, inplace = True)
df_archive_clean['p2'].replace('_', ' ', regex=True, inplace = True)
df_archive_clean['p3'].replace('_', ' ', regex=True, inplace = True)


# Convert all p1, p2, p3 strings to lowercase
df_archive_clean['p1'] = df_archive_clean['p1'].str.lower()
df_archive_clean['p2'] = df_archive_clean['p2'].str.lower()
df_archive_clean['p3'] = df_archive_clean['p3'].str.lower()


# Verify that underscores and uppercase no longer present in p1 values
df_archive_clean['p1'].value_counts()

golden retriever      136
labrador retriever    94 
pembroke              88 
chihuahua             78 
pug                   54 
                      .. 
grey fox              1  
gas pump              1  
teapot                1  
restaurant            1  
shield                1  
Name: p1, Length: 373, dtype: int64


# Verify that underscores and uppercase no longer present in p2 values
df_archive_clean['p2'].value_counts()

labrador retriever          95
golden retriever            82
cardigan                    73
chihuahua                   43
chesapeake bay retriever    40
                            ..
peacock                     1 
sandbar                     1 
spindle                     1 
african hunting dog         1 
sea urchin                  1 
Name: p2, Length: 396, dtype: int64


# Verify that underscores and uppercase no longer present in p3 values
df_archive_clean['p3'].value_counts()

labrador retriever    75
chihuahua             57
golden retriever      45
eskimo dog            36
kelpie                33
                      ..
parallel bars         1 
red wolf              1 
ibex                  1 
toyshop               1 
sunglass              1 
Name: p3, Length: 403, dtype: int64


# Save a final clean copy of our dataframe
df_archive_final = df_archive_clean.copy()


# Store a copy of our final archive in a CSV file. No additional files are necessary.
df_archive_final.to_csv('twitter_archive_master.csv')


# Calculate average of all ratings in the set
ave_rating_all = sum(df_archive_final['rating_numerator'])/sum(df_archive_final['rating_denominator'])
# Format numerator for printing
ave_denominator = round(ave_rating_all * 10, 2)
print('Ave. rating per dog: {}/10'.format(ave_denominator))

Ave. rating per dog: 11.67/10


# Calculate the average one dog rating
ave_one_dog = round(df_archive_final[df_archive_final['rating_denominator'] == 10]['rating_numerator'].mean(), 2)
print('Ave. one dog rating: {}/10'.format(ave_one_dog))

Ave. one dog rating: 11.7/10


# Get dataset of tweets with rating_denominator greater 10, i.e. more than one dog is being rated
df_multiple_dog_tweets = df_archive_final[df_archive_final['rating_denominator'] > 10]

# Get the average rating of this set
ave_mult_dog = sum(df_multiple_dog_tweets['rating_numerator'])/sum(df_multiple_dog_tweets['rating_denominator'])

# Format numerator for printing
ave_mult_dog_numerator = round(ave_mult_dog * 10, 2)
print('Ave. per dog in multi dog tweet: {}/10'.format(ave_mult_dog_numerator))

Ave. per dog in multi dog tweet: 11.19/10


df_multiple_dog_tweets['tweet_id'].count()

12


# Create a distribution of all of our ratings normalized on a scale of 10
ave_ratings = (df_archive_final['rating_numerator']/df_archive_final['rating_denominator'] * 10).astype(int)
# Check value counts for the rating column
ave_ratings.value_counts()

12      449
10      419
11      398
13      254
9       151
8       96 
7       51 
14      34 
5       33 
6       32 
3       19 
4       15 
2       9  
1       4  
75      1  
420     1  
0       1  
27      1  
26      1  
1776    1  
dtype: int64


# Set graph size
sns.set(rc={'figure.figsize':(11, 4)})

# Plot the distribution
ax = ave_ratings.plot(kind='hist', title = 'Ratings Distribution', bins=np.arange(min(ave_ratings), max(ave_ratings) + 1, 1));
# Ignore outliers
ax.set_xlim(0, 15)
ax.set_ylabel("# of Tweets")
ax.set_xlabel("Rating")

Text(0.5, 0, 'Rating')


# Create a new dataframe with timestamp and rating (which is a normalized rating on a scale out of 10)
df_ratings = df_archive_final[['timestamp', 'rating_numerator', 'rating_denominator' ]]
df_ratings['rating'] = (df_ratings['rating_numerator']/df_ratings['rating_denominator'] * 10).astype(int)
# Drop columns that are no longer needed
df_ratings = df_ratings[['timestamp','rating']]
df_ratings.set_index('timestamp');

<ipython-input-99-ecc4e7e53d96>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ratings['rating'] = (df_ratings['rating_numerator']/df_ratings['rating_denominator'] * 10).astype(int)


# Preliminary plot
ax = df_ratings['rating'].plot(title='Ratings Trend over Time', linewidth=1);
ax.set_xlabel("Timestamp")
ax.set_ylabel("Rating (out of 10)")

Text(0, 0.5, 'Rating (out of 10)')


# Remove outliers
df_ratings = df_ratings[~((df_ratings['rating'] == 1776) | (df_ratings['rating'] == 420) | (df_ratings['rating'] == 0))]


# Plot our average rating by month over time
ax = df_ratings.resample('M', on='timestamp').mean()['rating'].plot(title='Ratings Trend over Time', linewidth=1);
ax.set_xlabel("Month")
ax.set_ylabel("Rating (out of 10)")

Text(0, 0.5, 'Rating (out of 10)')


# Create new set of tweets with 1st prediction is a dog with at least 50% confidence
df_predicted_dogs = df_archive_final[(df_archive_final['p1_dog'] == True) & (df_archive_final['p1_conf'] >= 0.5)]

# Double check the size of our set
print('The size of our set is: {}'.format(df_predicted_dogs['tweet_id'].count()))

# Normalize our ratings to be on a scale of 10
df_predicted_dogs['rating']= (df_predicted_dogs['rating_numerator']/df_predicted_dogs['rating_denominator'] * 10).astype(int)

# Keep only columns relevant for our analysis
df_predicted_dogs = df_predicted_dogs[['tweet_id','p1', 'rating']]

# Print first few records
df_predicted_dogs.head()

The size of our set is: 928

<ipython-input-103-54ba5b635bfa>:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predicted_dogs['rating']= (df_predicted_dogs['rating_numerator']/df_predicted_dogs['rating_denominator'] * 10).astype(int)


df_predicted_dogs.groupby('p1').rating.mean().sort_values(ascending=False)

p1
clumber                   27.0
afghan hound              13.0
saluki                    13.0
curly-coated retriever    13.0
briard                    12.5
                          ... 
boston bull               8.4 
walker hound              8.0 
tibetan terrier           7.0 
japanese spaniel          5.0 
miniature schnauzer       5.0 
Name: rating, Length: 99, dtype: float64


# Plot the top results of average rating by predicted dog breed.
ax = df_predicted_dogs.groupby('p1').rating.mean().sort_values(ascending=False)[10::-1].plot(kind='barh', title='Highest Rated Predicted Breeds on Average');
ax.set_ylabel("Predicted Breed")
ax.set_xlabel("Ave. Rating")

Text(0.5, 0, 'Ave. Rating')


print('# of clumbers: {}'.format(df_predicted_dogs[df_predicted_dogs['p1'] == 'clumber']['tweet_id'].count()))

# of clumbers: 1


# Get a list of the top ten breeds with highest average ratings
top_rated_breeds = df_predicted_dogs.groupby('p1').rating.mean().sort_values(ascending=False)[0:10].index.tolist()

# Print the total number of this type of breed in our set
for breed in top_rated_breeds:
    print('# of {}s: {}'.format(breed, df_predicted_dogs[df_predicted_dogs['p1'] == breed]['tweet_id'].count()))

# of clumbers: 1
# of afghan hounds: 1
# of salukis: 3
# of curly-coated retrievers: 1
# of briards: 2
# of tibetan mastiffs: 3
# of blueticks: 1
# of toy terriers: 1
# of irish setters: 2
# of border terriers: 3


df_predicted_dogs['p1'].value_counts()

golden retriever          115
pembroke                  70 
labrador retriever        64 
chihuahua                 46 
pug                       43 
                          .. 
groenendael               1  
miniature schnauzer       1  
welsh springer spaniel    1  
irish water spaniel       1  
japanese spaniel          1  
Name: p1, Length: 99, dtype: int64


# Get a list of the top ten predicted breeds
top_predicted_breeds = df_predicted_dogs['p1'].value_counts()[0:10].index.tolist()

# Create a dataframe of these breeds and their ratings
df_common_breeds_ratings = df_predicted_dogs[df_predicted_dogs['p1'].isin(top_predicted_breeds)]

# Print the ratings for the top 10 predicted breeds in descending order
df_common_breeds_ratings.groupby('p1').rating.mean().sort_values(ascending=False)

p1
samoyed               11.733333
chow                  11.730769
pembroke              11.657143
golden retriever      11.556522
french bulldog        11.227273
labrador retriever    11.218750
pomeranian            11.172414
toy poodle            11.130435
chihuahua             10.565217
pug                   10.209302
Name: rating, dtype: float64


# Plot the most predicted breeds and their average ratings
ax = df_common_breeds_ratings.groupby('p1').rating.mean().sort_values(ascending=False)[10::-1].plot(kind='barh', title='Most Predicted Breeds & Ave. Ratings');
ax.set_ylabel("Predicted Breed")
ax.set_xlabel("Ave. Rating")

Text(0.5, 0, 'Ave. Rating')


# Create new set of tweets with 1st prediction is a dog with at least 50% confidence
df_popular_dogs = df_archive_final[(df_archive_final['p1_dog'] == True) & (df_archive_final['p1_conf'] >= 0.5)]

# Double check the size of our set
print('The size of our set is: {}'.format(df_popular_dogs['tweet_id'].count()))

# If popularity can be measured by retweets and favorites, let's some these and create a new column
df_popular_dogs['popularity'] = df_popular_dogs['retweet_count'] + df_popular_dogs['favorite_count']

# Keep only columns relevant for our analysis
df_popular_dogs = df_popular_dogs[['tweet_id','p1', 'popularity', 'retweet_count', 'favorite_count']]

# Print first few records
df_popular_dogs.head()

The size of our set is: 928

<ipython-input-111-9e4a97e732bd>:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_popular_dogs['popularity'] = df_popular_dogs['retweet_count'] + df_popular_dogs['favorite_count']


df_popular_dogs.groupby('p1').popularity.mean().sort_values(ascending=False)

p1
saluki                 37440.000000
irish water spaniel    31264.000000
eskimo dog             24974.555556
french bulldog         23033.318182
afghan hound           22617.000000
                           ...     
miniature schnauzer    1112.000000 
english springer       1068.000000 
redbone                836.000000  
giant schnauzer        332.000000  
tibetan terrier        269.000000  
Name: popularity, Length: 99, dtype: float64


# Plot the top results of average popularity by predicted dog breeds
ax = df_popular_dogs.groupby('p1').popularity.mean().sort_values(ascending=False)[10::-1].plot(kind='barh', title='Most Popular Predicted Breeds on Average');
ax.set_ylabel("Predicted Breed")
ax.set_xlabel("# of Retweets & Favorites")

Text(0.5, 0, '# of Retweets & Favorites')

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	666020888022790149	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg	1	Welsh_springer_spaniel	0.465074	True	collie	0.156665	True	Shetland_sheepdog	0.061428	True
1	666029285002620928	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg	1	redbone	0.506826	True	miniature_pinscher	0.074192	True	Rhodesian_ridgeback	0.072010	True
2	666033412701032449	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg	1	German_shepherd	0.596461	True	malinois	0.138584	True	bloodhound	0.116197	True
3	666044226329800704	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg	1	Rhodesian_ridgeback	0.408143	True	redbone	0.360687	True	miniature_pinscher	0.222752	True
4	666049248165822465	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg	1	miniature_pinscher	0.560311	True	Rottweiler	0.243682	True	Doberman	0.154629	True

	tweet_id	retweet_count	favorite_count
0	892420643555336193	8853	39467
1	892177421306343426	6514	33819
2	891815181378084864	4328	25461
3	891689557279858688	8964	42908
4	891327558926688256	9774	41048

	text	rating_numerator	rating_denominator
313	@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho	960	0
342	@docmisterio account started on 11/15/15	11	15
433	The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd	84	70
516	Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx	24	7
784	RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/…	9	11
902	Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE	165	150
1068	After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ	9	11
1120	Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv	204	170
1165	Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a	4	20
1202	This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq	50	50
1228	Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1	99	90
1254	Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12	80	80
1274	From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK	45	50
1351	Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa	60	50
1433	Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ	44	40
1598	Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating	4	20
1634	Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3	143	130
1635	Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55	121	110
1662	This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5	7	11
1663	I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible	20	16
1779	IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq	144	120
1843	Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw	88	80
2335	This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv	1	2

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
30	886267009285017600	8.862664e+17	2.281182e+09	2017-07-15 16:51:35 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@NonWhiteHat @MayhewMayhem omg hello tanner you are a scary good boy 12/10 would pet with extreme caution	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
55	881633300179243008	8.816070e+17	4.738443e+07	2017-07-02 21:58:53 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@roushfenway These are good dogs but 17/10 is an emotional impulse rating. More like 13/10s	NaN	NaN	NaN	NaN	17	10	None	None	None	None	None
64	879674319642796034	8.795538e+17	3.105441e+09	2017-06-27 12:14:36 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@RealKentMurphy 14/10 confirmed	NaN	NaN	NaN	NaN	14	10	None	None	None	None	None
113	870726314365509632	8.707262e+17	1.648776e+07	2017-06-02 19:38:25 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@ComplicitOwl @ShopWeRateDogs >10/10 is reserved for dogs	NaN	NaN	NaN	NaN	10	10	None	None	None	None	None
148	863427515083354112	8.634256e+17	7.759620e+07	2017-05-13 16:15:35 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@Jack_Septic_Eye I'd need a few more pics to polish a full analysis, but based on the good boy content above I'm leaning towards 12/10	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
179	857214891891077121	8.571567e+17	1.806710e+08	2017-04-26 12:48:51 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@Marc_IRL pixelated af 12/10	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
185	856330835276025856	NaN	NaN	2017-04-24 02:15:55 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	RT @Jenna_Marbles: @dog_rates Thanks for rating my cermets 14/10 wow I'm so proud I watered them so much	8.563302e+17	66699013.0	2017-04-24 02:13:14 +0000	NaN	14	10	None	None	None	None	None
186	856288084350160898	8.562860e+17	2.792810e+08	2017-04-23 23:26:03 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@xianmcguire @Jenna_Marbles Kardashians wouldn't be famous if as a society we didn't place enormous value on what they do. The dogs are very deserving of their 14/10	NaN	NaN	NaN	NaN	14	10	None	None	None	None	None
188	855862651834028034	8.558616e+17	1.943518e+08	2017-04-22 19:15:32 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research	NaN	NaN	NaN	NaN	420	10	None	None	None	None	None
189	855860136149123072	8.558585e+17	1.361572e+07	2017-04-22 19:05:32 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10	NaN	NaN	NaN	NaN	666	10	None	None	None	None	None
218	850333567704068097	8.503288e+17	2.195506e+07	2017-04-07 13:04:55 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@markhoppus MARK THAT DOG HAS SEEN AND EXPERIENCED MANY THINGS. PROBABLY LOST OTHER EAR DOING SOMETHING HEROIC. 13/10 HUG THE DOG HOPPUS	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
228	848213670039564288	8.482121e+17	4.196984e+09	2017-04-01 16:41:12 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Jerry just apuppologized to me. He said there was no ill-intent to the slippage. I overreacted I admit. Pupgraded to an 11/10 would pet	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
234	847617282490613760	8.476062e+17	4.196984e+09	2017-03-31 01:11:22 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	.@breaannanicolee PUPDATE: Cannon has a heart on his nose. Pupgraded to a 13/10	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
274	840698636975636481	8.406983e+17	8.405479e+17	2017-03-11 22:59:09 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@0_kelvin_0 >10/10 is reserved for puppos sorry Kevin	NaN	NaN	NaN	NaN	10	10	None	None	None	None	None
290	838150277551247360	8.381455e+17	2.195506e+07	2017-03-04 22:12:52 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@markhoppus 182/10	NaN	NaN	NaN	NaN	182	10	None	None	None	None	None
291	838085839343206401	8.380855e+17	2.894131e+09	2017-03-04 17:56:49 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@bragg6of8 @Andy_Pace_ we are still looking for the first 15/10	NaN	NaN	NaN	NaN	15	10	None	None	None	None	None
313	835246439529840640	8.352460e+17	2.625958e+07	2017-02-24 21:54:03 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho	NaN	NaN	NaN	NaN	960	0	None	None	None	None	None
342	832088576586297345	8.320875e+17	3.058208e+07	2017-02-16 04:45:50 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@docmisterio account started on 11/15/15	NaN	NaN	NaN	NaN	11	15	None	None	None	None	None
346	831926988323639298	8.319030e+17	2.068372e+07	2017-02-15 18:03:45 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@UNC can confirm 12/10	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
375	828361771580813312	NaN	NaN	2017-02-05 21:56:51 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	Beebop and Doobert should start a band 12/10 would listen	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
387	826598799820865537	8.265984e+17	4.196984e+09	2017-02-01 01:11:25 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	I was going to do 007/10, but the joke wasn't worth the <10 rating	NaN	NaN	NaN	NaN	7	10	None	None	None	None	None
409	823333489516937216	8.233264e+17	1.582854e+09	2017-01-23 00:56:15 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@HistoryInPics 13/10	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
427	821153421864615936	8.211526e+17	1.132119e+08	2017-01-17 00:33:26 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@imgur for a polar bear tho I'd say 13/10 is appropriate	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
498	813130366689148928	8.131273e+17	4.196984e+09	2016-12-25 21:12:41 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	I've been informed by multiple sources that this is actually a dog elf who's tired from helping Santa all night. Pupgraded to 12/10	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
513	811647686436880384	8.116272e+17	4.196984e+09	2016-12-21 19:01:02 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	PUPDATE: I've been informed that Augie was actually bringing his family these flowers when he tripped. Very good boy. Pupgraded to 11/10	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
570	801854953262350336	8.018543e+17	1.185634e+07	2016-11-24 18:28:13 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	.@NBCSports OMG THE TINY HAT I'M GOING TO HAVE TO SAY 11/10 NBC	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
576	800859414831898624	8.008580e+17	2.918590e+08	2016-11-22 00:32:18 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@SkyWilliams doggo simply protecting you from evil that which you cannot see. 11/10 would give extra pets	NaN	NaN	NaN	NaN	11	10	None	doggo	None	None	None
611	797165961484890113	7.971238e+17	2.916630e+07	2016-11-11 19:55:50 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@JODYHiGHROLLER it may be an 11/10 but what do I know 😉	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
701	786051337297522688	7.727430e+17	7.305050e+17	2016-10-12 03:50:17 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	13/10 for breakdancing puppo @shibbnbot	NaN	NaN	NaN	NaN	13	10	None	None	None	None	puppo
707	785515384317313025	NaN	NaN	2016-10-10 16:20:36 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Today, 10/10, should be National Dog Rates Day	NaN	NaN	NaN	NaN	10	10	None	None	None	None	None
843	766714921925144576	7.667118e+17	4.196984e+09	2016-08-19 19:14:16 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	His name is Charley and he already has a new set of wheels thanks to donations. I heard his top speed was also increased. 13/10 for Charley	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
857	763956972077010945	7.638652e+17	1.584641e+07	2016-08-12 04:35:10 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@TheEllenShow I'm not sure if you know this but that doggo right there is a 12/10	NaN	NaN	NaN	NaN	12	10	None	doggo	None	None	None
967	750381685133418496	7.501805e+17	4.717297e+09	2016-07-05 17:31:49 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	13/10 such a good doggo\n@spaghemily	NaN	NaN	NaN	NaN	13	10	None	doggo	None	None	None
1005	747651430853525504	7.476487e+17	4.196984e+09	2016-06-28 04:42:46 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Other pupper asked not to have his identity shared. Probably just embarrassed about the headbutt. Also 12/10 it'll be ok mystery pup	NaN	NaN	NaN	NaN	12	10	None	None	None	pupper	None
1080	738891149612572673	7.384119e+17	3.589728e+08	2016-06-04 00:32:32 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@mount_alex3 13/10	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
1295	707983188426153984	7.079801e+17	2.319108e+09	2016-03-10 17:35:20 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	@serial @MrRoles OH MY GOD I listened to all of season 1 during a single road trip. I love you guys! I can confirm Bernie's 12/10 rating :)	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
1345	704491224099647488	7.044857e+17	2.878549e+07	2016-03-01 02:19:31 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	13/10 hero af\n@ABC	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
1445	696518437233913856	NaN	NaN	2016-02-08 02:18:30 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Oh my god 10/10 for every little hot dog pupper	NaN	NaN	NaN	NaN	10	10	None	None	None	pupper	None
1446	696490539101908992	6.964887e+17	4.196984e+09	2016-02-08 00:27:39 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	After reading the comments I may have overestimated this pup. Downgraded to a 1/10. Please forgive me	NaN	NaN	NaN	NaN	1	10	None	None	None	None	None
1474	693644216740769793	6.936422e+17	4.196984e+09	2016-01-31 03:57:23 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	BREAKING PUPDATE: I've just been notified that (if in U.S.) this dog appears to be operating the vehicle. Upgraded to 10/10. Skilled af	NaN	NaN	NaN	NaN	10	10	None	None	None	None	None
1479	693582294167244802	6.935722e+17	1.198989e+09	2016-01-30 23:51:19 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Personally I'd give him an 11/10. Not sure why you think you're qualified to rate such a stellar pup.\n@CommonWhiteGirI	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
1497	692423280028966913	6.924173e+17	4.196984e+09	2016-01-27 19:05:49 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	PUPDATE: just noticed this dog has some extra legs. Very advanced. Revolutionary af. Upgraded to a 9/10	NaN	NaN	NaN	NaN	9	10	None	None	None	None	None
1523	690607260360429569	6.903413e+17	4.670367e+08	2016-01-22 18:49:36 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	12/10 @LightningHoltt	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
1598	686035780142297088	6.860340e+17	4.196984e+09	2016-01-10 04:04:10 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating	NaN	NaN	NaN	NaN	4	20	None	None	None	None	None
1605	685681090388975616	6.855479e+17	4.196984e+09	2016-01-09 04:34:45 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Jack deserves another round of applause. If you missed this earlier today I strongly suggest reading it. Wonderful first 14/10 🐶❤️	NaN	NaN	NaN	NaN	14	10	None	None	None	None	None
1618	684969860808454144	6.849598e+17	4.196984e+09	2016-01-07 05:28:35 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	For those who claim this is a goat, u are wrong. It is not the Greatest Of All Time. The rating of 5/10 should have made that clear. Thank u	NaN	NaN	NaN	NaN	5	10	None	None	None	None	None
1663	682808988178739200	6.827884e+17	4.196984e+09	2016-01-01 06:22:03 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible	NaN	NaN	NaN	NaN	20	16	None	None	None	None	None
1689	681340665377193984	6.813394e+17	4.196984e+09	2015-12-28 05:07:27 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	I've been told there's a slight possibility he's checking his mirror. We'll bump to 9.5/10. Still a menace	NaN	NaN	NaN	NaN	5	10	None	None	None	None	None
1774	678023323247357953	6.780211e+17	4.196984e+09	2015-12-19 01:25:31 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	After getting lost in Reese's eyes for several minutes we're going to upgrade him to a 13/10	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
1819	676590572941893632	6.765883e+17	4.196984e+09	2015-12-15 02:32:17 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	After some outrage from the crowd. Bubbles is being upgraded to a 7/10. That's as high as I'm going. Thank you	NaN	NaN	NaN	NaN	7	10	None	None	None	None	None
1844	675849018447167488	6.758457e+17	4.196984e+09	2015-12-13 01:25:37 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This dog is being demoted to a 9/10 for not wearing a helmet while riding. Gotta stay safe out there. Thank you	NaN	NaN	NaN	NaN	9	10	None	None	None	None	None
1895	674742531037511680	6.747400e+17	4.196984e+09	2015-12-10 00:08:50 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Some clarification is required. The dog is singing Cher and that is more than worthy of an 11/10. Thank you	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
1905	674606911342424069	6.744689e+17	4.196984e+09	2015-12-09 15:09:55 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	The 13/10 also takes into account this impeccable yard. Louis is great but the future dad in me can't ignore that luscious green grass	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
1914	674330906434379776	6.658147e+17	1.637468e+07	2015-12-08 20:53:11 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	13/10\n@ABC7	NaN	NaN	NaN	NaN	13	10	None	None	None	None	None
1940	673716320723169284	6.737159e+17	4.196984e+09	2015-12-07 04:11:02 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	The millennials have spoken and we've decided to immediately demote to a 1/10. Thank you	NaN	NaN	NaN	NaN	1	10	None	None	None	None	None
2038	671550332464455680	6.715449e+17	4.196984e+09	2015-12-01 04:44:10 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	After 22 minutes of careful deliberation this dog is being demoted to a 1/10. The longer you look at him the more terrifying he becomes	NaN	NaN	NaN	NaN	1	10	None	None	None	None	None
2149	669684865554620416	6.693544e+17	4.196984e+09	2015-11-26 01:11:28 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	After countless hours of research and hundreds of formula alterations we have concluded that Dug should be bumped to an 11/10	NaN	NaN	NaN	NaN	11	10	None	None	None	None	None
2189	668967877119254528	6.689207e+17	2.143566e+07	2015-11-24 01:42:25 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	12/10 good shit Bubka\n@wane15	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None
2298	667070482143944705	6.670655e+17	4.196984e+09	2015-11-18 20:02:51 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	After much debate this dog is being upgraded to 10/10. I repeat 10/10	NaN	NaN	NaN	NaN	10	10	None	None	None	None	None

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	666020888022790149	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg	1	Welsh_springer_spaniel	0.465074	True	collie	0.156665	True	Shetland_sheepdog	0.061428	True
1	666029285002620928	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg	1	redbone	0.506826	True	miniature_pinscher	0.074192	True	Rhodesian_ridgeback	0.072010	True
2	666033412701032449	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg	1	German_shepherd	0.596461	True	malinois	0.138584	True	bloodhound	0.116197	True
3	666044226329800704	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg	1	Rhodesian_ridgeback	0.408143	True	redbone	0.360687	True	miniature_pinscher	0.222752	True
4	666049248165822465	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg	1	miniature_pinscher	0.560311	True	Rottweiler	0.243682	True	Doberman	0.154629	True
...	...	...	...	...	...	...	...	...	...	...	...	...
2070	891327558926688256	https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg	2	basset	0.555712	True	English_springer	0.225770	True	German_short-haired_pointer	0.175219	True
2071	891689557279858688	https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg	1	paper_towel	0.170278	False	Labrador_retriever	0.168086	True	spatula	0.040836	False
2072	891815181378084864	https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg	1	Chihuahua	0.716012	True	malamute	0.078253	True	kelpie	0.031379	True
2073	892177421306343426	https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg	1	Chihuahua	0.323581	True	Pekinese	0.090647	True	papillon	0.068957	True
2074	892420643555336193	https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg	1	orange	0.097049	False	bagel	0.085851	False	banana	0.076110	False

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
0	892420643555336193	NaN	NaN	2017-08-01 16:23:56 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892420643555336193/photo/1	13	10	Phineas	None	None	None	None
1	892177421306343426	NaN	NaN	2017-08-01 00:17:27 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892177421306343426/photo/1	13	10	Tilly	None	None	None	None
2	891815181378084864	NaN	NaN	2017-07-31 00:18:03 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891815181378084864/photo/1	12	10	Archie	None	None	None	None
3	891689557279858688	NaN	NaN	2017-07-30 15:58:51 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891689557279858688/photo/1	13	10	Darla	None	None	None	None
4	891327558926688256	NaN	NaN	2017-07-29 16:00:24 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1	12	10	Franklin	None	None	None	None

	tweet_id	timestamp	source	text	expanded_urls	rating_numerator	rating_denominator	name	dog_stage	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	667435689202614272	2015-11-19 20:14:03 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Ermergerd 12/10 https://t.co/PQni2sjPsm	https://twitter.com/dog_rates/status/667435689202614272/photo/1	12	10	None	None	https://pbs.twimg.com/media/CUM1OHCW4AEgGSi.jpg	1	Rottweiler	0.999091	True	miniature_pinscher	0.000450	True	black-and-tan_coonhound	0.000157	True
1	667437278097252352	2015-11-19 20:20:22 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Never seen this breed before. Very pointy pup. Hurts when you cuddle. Still cute tho. 10/10 https://t.co/97HuBrVuOx	https://twitter.com/dog_rates/status/667437278097252352/photo/1	10	10	None	None	https://pbs.twimg.com/media/CUM2qWaWoAUZ06L.jpg	1	porcupine	0.989154	False	bath_towel	0.006300	False	badger	0.000966	False
2	667443425659232256	2015-11-19 20:44:47 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Exotic dog here. Long neck. Weird paws. Obsessed with bread. Waddles. Flies sometimes (wow!). Very happy dog. 6/10 https://t.co/rqO4I3nf2N	https://twitter.com/dog_rates/status/667443425659232256/photo/1	6	10	None	None	https://pbs.twimg.com/media/CUM8QZwW4AAVsBl.jpg	1	goose	0.980815	False	drake	0.006918	False	hen	0.005255	False
3	667453023279554560	2015-11-19 21:22:56 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	Meet Cupcake. I would do unspeakable things for Cupcake. 11/10 https://t.co/6uLCWR9Efa	https://twitter.com/dog_rates/status/667453023279554560/photo/1	11	10	Cupcake	None	https://pbs.twimg.com/media/CUNE_OSUwAAdHhX.jpg	1	Labrador_retriever	0.825670	True	French_bulldog	0.056639	True	Staffordshire_bullterrier	0.054018	True
4	667455448082227200	2015-11-19 21:32:34 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	This is Reese and Twips. Reese protects Twips. Both think they're too good for seat belts. Simply reckless. 7/10s https://t.co/uLzRi1drVK	https://twitter.com/dog_rates/status/667455448082227200/photo/1	7	10	Reese	None	https://pbs.twimg.com/media/CUNHMXTU8AAS3HH.jpg	1	Tibetan_terrier	0.676376	True	Irish_terrier	0.054933	True	Yorkshire_terrier	0.040576	True

	tweet_id	timestamp	source	text	expanded_urls	rating_numerator	rating_denominator	name	dog_stage	jpg_url	...	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog	retweet_count	favorite_count
1330	829501995190984704	2017-02-09 01:27:41+00:00	Twitter for iPhone	This is Leo. He was a skater pup. She said see ya later pup. He wasn't good enough for her. 12/10 you're good enough for me Leo https://t.co/Xw9JbJHTul	https://twitter.com/dog_rates/status/829501995190984704/photo/1,https://twitter.com/dog_rates/status/829501995190984704/photo/1	12	10	Leo	None	https://pbs.twimg.com/media/C4L7p19W8AA3Fs_.jpg	...	0.950851	True	Pekinese	0.015200	True	pug	0.011094	True	12224	34913
1332	811627233043480576	2016-12-21 17:39:46+00:00	Twitter for iPhone	This is Augie. He's a savage. Doesn't give a h*ck about your garden. Still 10/10 would forgive then pet https://t.co/IU8S0n4oxn	https://twitter.com/dog_rates/status/811627233043480576/photo/1	10	10	Augie	None	https://pbs.twimg.com/media/C0N6opSXAAAkCtN.jpg	...	0.396280	True	Pembroke	0.049562	True	wire-haired_fox_terrier	0.046349	True	3650	14265

	tweet_id	p1	popularity	retweet_count	favorite_count
0	667435689202614272	rottweiler	415	89	326
3	667453023279554560	labrador retriever	423	96	327
4	667455448082227200	tibetan terrier	269	66	203
6	667491009379606528	borzoi	801	242	559
8	667502640335572993	labrador retriever	794	231	563

An Analysis of WeRateDog Tweets¶

Project Details:¶

Table of Contents¶

Introduction¶

Part I: Gathering Data¶

Part II: Assessing Data¶

(1) Twitter Archive Data¶

(2) Image Predictions¶

(3) API data¶

Assesment Summary: Issues List¶

Quality¶

df_twitter_achive dataframe¶

df_image_predictions dataframe¶

df_twitter_api_data dataframe¶

Tidiness¶

df_twitter_achive dataframe¶

df_image_predictions dataframe¶

df_twitter_api_data dataframe¶

Part III: Cleaning Data¶

(1) Missing Data¶

Issue: Replies and retweets are included in our archive dataset.¶

Define¶

Code¶

Test¶

Issue: Tweets have no photo are included in our archive dataset.¶

Define¶

Code¶

Test¶

Issue: Not all tweets in the archive have an image prediction.¶

Issue: Not all tweets in the archive have associated api data.¶

(2) Tidiness¶

Issue: The columns associated with replies and retweets should be removed from the twitter archive.¶

Define¶

Code¶

Test¶

Issue: The doggo, floofer, pupper, and puppo columns of the image predictions dataset should be converted into a single dog_stage column.¶

Define¶

Code¶

Test¶

Issue: The image predictions dataset should be merged into the larger tweet archive, joined on tweet_id¶

Define¶

Code¶

Test¶

Issue: The api data should be merged into the larger tweet archive, joined on tweet_id (which needs to be converted to an integer type first)¶

Define¶

Code¶

Test¶

(3) Quality¶

Issue: timestamp has the wrong data type (it should be datetime and not text)¶

Define¶

Code¶

Test¶

Issue: retweet_count and favorite_count should be converted to integer format¶

Define¶

Code¶

Test¶

Issue: source column contains unnecessary link information. We can strip it and just keep the actual app name.¶

Define¶

Code¶

Test¶

Issue: name contains erroneous values ("a", "such", "None")¶

Define¶

Code¶

Test¶

Issue: rating_numerator and rating_denominator were not always extracted correctly, especially if there was a date or percentage preceding the rating in the tweet.¶

Define¶

Code¶

Test¶

Issue: The p1, p2 and p3 columns are dirty (i.e. capitalization is inconsistent and there are underscores between words.)¶

Define¶

Code¶

Test¶

Part IV: Storing, Analyzing and Visualizing Data¶

(1) Store Data¶

(2) Analyze and Visualize Data¶

Question (1): What is the average rating?¶

Question (2): Have rating trends changed over time?¶

Question (3): Do certain dog breeds have higher ratings?¶

Question (4): Based on retweets and favorites, what dog breeds are the most popular?¶

Conclusion¶

`df_twitter_achive` dataframe¶

`df_image_predictions` dataframe¶

`df_twitter_api_data` dataframe¶

`df_twitter_achive` dataframe¶

`df_image_predictions` dataframe¶

`df_twitter_api_data` dataframe¶

Issue: The `doggo`, `floofer`, `pupper`, and `puppo` columns of the image predictions dataset should be converted into a single `dog_stage` column.¶

Issue: The image predictions dataset should be merged into the larger tweet archive, joined on `tweet_id`¶

Issue: The api data should be merged into the larger tweet archive, joined on `tweet_id` (which needs to be converted to an integer type first)¶

Issue: `timestamp` has the wrong data type (it should be datetime and not text)¶

Issue: `retweet_count` and `favorite_count` should be converted to integer format¶

Issue: `source` column contains unnecessary link information. We can strip it and just keep the actual app name.¶

Issue: `name` contains erroneous values ("a", "such", "None")¶

Issue: `rating_numerator` and `rating_denominator` were not always extracted correctly, especially if there was a date or percentage preceding the rating in the tweet.¶

Issue: The `p1`, `p2` and `p3` columns are dirty (i.e. capitalization is inconsistent and there are underscores between words.)¶