import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
%matplotlib inline
# Gather piece 1: the WeRateDogs archive supplied as a local CSV.
df_twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
# Gather piece 2: download the image-predictions TSV programmatically.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response.raise_for_status()  # fail loudly instead of silently writing an HTML error page to disk
with open(url.split('/')[-1], mode='wb') as file:
    file.write(response.content)
df_img = pd.read_csv('image-predictions.tsv', sep='\t')
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# NOTE(review): credentials are redacted for publication — fill in your own
# Twitter developer keys before running the gathering step below.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit=True makes tweepy sleep through Twitter's rate-limit windows
api = tweepy.API(auth, wait_on_rate_limit=True)
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive.
# NOTE(review): `tweet_ids` is not defined anywhere in this export — presumably
# df_twitter_archive.tweet_id.values from a missing cell; confirm before rerunning.
count = 0
fails_dict = {}  # tweet_id -> exception, for post-run inspection
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            # tweet_mode='extended' returns the full (untruncated) tweet text
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:  # tweepy < 4.0; v4 renamed this tweepy.errors.TweepyException
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print(end - start)
print(fails_dict)
df_tweet_json = pd.read_json('tweet-json.txt', lines=True)
df_twitter_archive.sample(5)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1533 | 689999384604450816 | NaN | NaN | 2016-01-21 02:34:07 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Covach. He's trying to melt the snow. ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/689999384... | 10 | 10 | Covach | None | None | None | None |
110 | 871102520638267392 | NaN | NaN | 2017-06-03 20:33:19 +0000 | <a href="http://twitter.com/download/iphone" r... | Never doubt a doggo 14/10 https://t.co/AbBLh2FZCH | NaN | NaN | NaN | https://twitter.com/animalcog/status/871075758... | 14 | 10 | None | doggo | None | None | None |
571 | 801538201127157760 | NaN | NaN | 2016-11-23 21:29:33 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Wallace. He'll be your chau-fur this e... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/801538201... | 12 | 10 | Wallace | None | None | None | None |
1154 | 725458796924002305 | NaN | NaN | 2016-04-27 22:57:10 +0000 | <a href="http://twitter.com/download/iphone" r... | Pup had to be removed cuz it wouldn't have bee... | NaN | NaN | NaN | https://twitter.com/foxdeportes/status/7251360... | 13 | 10 | None | None | None | None | None |
487 | 814153002265309185 | NaN | NaN | 2016-12-28 16:56:16 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Oshie. He's ready to party. Bought tha... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/814153002... | 12 | 10 | Oshie | None | None | None | None |
df_twitter_archive.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 in_reply_to_status_id 78 non-null float64 2 in_reply_to_user_id 78 non-null float64 3 timestamp 2356 non-null object 4 source 2356 non-null object 5 text 2356 non-null object 6 retweeted_status_id 181 non-null float64 7 retweeted_status_user_id 181 non-null float64 8 retweeted_status_timestamp 181 non-null object 9 expanded_urls 2297 non-null object 10 rating_numerator 2356 non-null int64 11 rating_denominator 2356 non-null int64 12 name 2356 non-null object 13 doggo 2356 non-null object 14 floofer 2356 non-null object 15 pupper 2356 non-null object 16 puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB
#check for incorrect or missing values in the twitter-archive-enhanced.csv name column
# Collect the row positions where the extracted 'name' does not appear verbatim
# in the tweet text. A single-pass comprehension replaces the original
# repeated np.append calls, which re-allocate the array each iteration (O(n^2)).
names = df_twitter_archive['name'].values
texts = df_twitter_archive['text'].values
arr = np.array([i for i in range(names.size) if names[i] not in texts[i]], dtype='int64')
arr.size
743
# Check for instances where incorrect 'rating_denominator' exist
# (WeRateDogs ratings are expected to use a denominator of 10)
df_twitter_archive[df_twitter_archive['rating_denominator'] != 10].rating_denominator
313 0 342 15 433 70 516 7 784 11 902 150 1068 11 1120 170 1165 20 1202 50 1228 90 1254 80 1274 50 1351 50 1433 40 1598 20 1634 130 1635 110 1662 11 1663 16 1779 120 1843 80 2335 2 Name: rating_denominator, dtype: int64
#check for duplicated records in_enhanced data twitter_archive_enhanced data
df_twitter_archive[df_twitter_archive.duplicated()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo |
---|
#check for duplicate columns in different tables - twitter-archive-enhanced and tweet_json
# Concatenate both tables' column names into one Series and surface repeats.
all_columns = pd.Series(df_twitter_archive.columns.tolist() + df_tweet_json.columns.tolist())
all_columns[all_columns.duplicated()]
25 source 26 in_reply_to_status_id 28 in_reply_to_user_id dtype: object
#Store df_tweet_json in a csv file named 'tweet_json.csv'
# index=False keeps the meaningless RangeIndex out of the file, so re-reading
# it later will not produce a spurious 'Unnamed: 0' column.
df_tweet_json.to_csv('tweet_json.csv', index=False)
# View the structure of image-predictions data
df_img.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null int64 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 p1 2075 non-null object 4 p1_conf 2075 non-null float64 5 p1_dog 2075 non-null bool 6 p2 2075 non-null object 7 p2_conf 2075 non-null float64 8 p2_dog 2075 non-null bool 9 p3 2075 non-null object 10 p3_conf 2075 non-null float64 11 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB
#check for duplicate records in image-prediction data
df_img[df_img.duplicated()]
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog |
---|
df_img.sample(5)
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
124 | 668237644992782336 | https://pbs.twimg.com/media/CUYOl0kW4AAVe_p.jpg | 1 | chow | 0.809320 | True | minivan | 0.071311 | False | Pekinese | 0.037870 | True |
1942 | 861288531465048066 | https://pbs.twimg.com/ext_tw_video_thumb/86128... | 1 | syringe | 0.144712 | False | oxygen_mask | 0.106684 | False | Bouvier_des_Flandres | 0.082610 | True |
2033 | 883482846933004288 | https://pbs.twimg.com/media/DELC9dZXUAADqUk.jpg | 1 | golden_retriever | 0.943082 | True | Labrador_retriever | 0.032409 | True | kuvasz | 0.005501 | True |
1462 | 778383385161035776 | https://pbs.twimg.com/media/Cs1fjyqWIAE2jop.jpg | 1 | collie | 0.345266 | True | borzoi | 0.312823 | True | Border_collie | 0.213011 | True |
12 | 666073100786774016 | https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg | 1 | Walker_hound | 0.260857 | True | English_foxhound | 0.175382 | True | Ibizan_hound | 0.097471 | True |
#check number of records that have same values in columns id and id_str in tweet_json data
# NOTE(review): only 1518 of 2354 match — the sample output shows id_str losing
# precision (e.g. ...929 vs ...928), presumably a float round-trip inside
# read_json; the integer 'id' column is the reliable key. Confirm before relying on id_str.
df_tweet_json[df_tweet_json['id'] == df_tweet_json['id_str']].shape
(1518, 31)
# View the structure of tweet_json data
df_tweet_json.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2354 entries, 0 to 2353 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 created_at 2354 non-null datetime64[ns, UTC] 1 id 2354 non-null int64 2 id_str 2354 non-null int64 3 full_text 2354 non-null object 4 truncated 2354 non-null bool 5 display_text_range 2354 non-null object 6 entities 2354 non-null object 7 extended_entities 2073 non-null object 8 source 2354 non-null object 9 in_reply_to_status_id 78 non-null float64 10 in_reply_to_status_id_str 78 non-null float64 11 in_reply_to_user_id 78 non-null float64 12 in_reply_to_user_id_str 78 non-null float64 13 in_reply_to_screen_name 78 non-null object 14 user 2354 non-null object 15 geo 0 non-null float64 16 coordinates 0 non-null float64 17 place 1 non-null object 18 contributors 0 non-null float64 19 is_quote_status 2354 non-null bool 20 retweet_count 2354 non-null int64 21 favorite_count 2354 non-null int64 22 favorited 2354 non-null bool 23 retweeted 2354 non-null bool 24 possibly_sensitive 2211 non-null float64 25 possibly_sensitive_appealable 2211 non-null float64 26 lang 2354 non-null object 27 retweeted_status 179 non-null object 28 quoted_status_id 29 non-null float64 29 quoted_status_id_str 29 non-null float64 30 quoted_status 28 non-null object dtypes: bool(4), datetime64[ns, UTC](1), float64(11), int64(4), object(11) memory usage: 505.9+ KB
df_tweet_json.sample(5)
created_at | id | id_str | full_text | truncated | display_text_range | entities | extended_entities | source | in_reply_to_status_id | ... | favorite_count | favorited | retweeted | possibly_sensitive | possibly_sensitive_appealable | lang | retweeted_status | quoted_status_id | quoted_status_id_str | quoted_status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1096 | 2016-05-27 01:47:23+00:00 | 736010884653420544 | 736010884653420544 | Right after you graduate vs when you remember ... | False | [0, 115] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'media': [{'id': 736010860062248961, 'id_str'... | <a href="http://twitter.com/download/iphone" r... | NaN | ... | 8652 | False | False | 0.0 | 0.0 | en | NaN | NaN | NaN | NaN |
125 | 2017-05-27 19:39:34+00:00 | 868552278524837888 | 868552278524837888 | Say hello to Cooper. His expression is the sam... | False | [0, 139] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'media': [{'id': 868552270358618113, 'id_str'... | <a href="http://twitter.com/download/iphone" r... | NaN | ... | 10539 | False | False | 0.0 | 0.0 | en | NaN | NaN | NaN | NaN |
545 | 2016-12-05 17:16:37+00:00 | 805823200554876929 | 805823200554876928 | RT @dog_rates: This is Reginald. He's one magi... | False | [0, 114] | {'hashtags': [], 'symbols': [], 'user_mentions... | NaN | <a href="http://twitter.com/download/iphone" r... | NaN | ... | 0 | False | False | 0.0 | 0.0 | en | {'created_at': 'Fri Oct 07 00:06:50 +0000 2016... | NaN | NaN | NaN |
1068 | 2016-06-08 01:45:19+00:00 | 740359016048689152 | 740359016048689152 | This is Chase. He's in a predicament. 9/10 hel... | False | [0, 67] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'media': [{'id': 740359008121298944, 'id_str'... | <a href="http://twitter.com/download/iphone" r... | NaN | ... | 3610 | False | False | 0.0 | 0.0 | en | NaN | NaN | NaN | NaN |
995 | 2016-06-30 01:15:31+00:00 | 748324050481647620 | 748324050481647616 | This is Duke. He permanently looks like he jus... | False | [0, 77] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'media': [{'id': 748324042759938048, 'id_str'... | <a href="http://twitter.com/download/iphone" r... | NaN | ... | 4078 | False | False | 0.0 | 0.0 | en | NaN | NaN | NaN | NaN |
5 rows × 31 columns
twitter-archive-enhanced
missing data in the following columns(expanded_urls, retweeted_status_timestamp, retweeted_status_user_id, retweeted_status_id, in_reply_to_user_id, in_reply_to_status_id, name)
twitter-archive-enhanced
- timestamp column has incorrect data type of object instead of datetime
twitter-archive-enhanced
- 23 instances where incorrect 'rating_denominator' exist
twitter-archive-enhanced
- 9 records had inappropriate rating_numerator variables i.e greater than 20 after correcting the rating_denominator issue
twitter-archive-enhanced
- some records have more than one dog stage
tweet_json
- missing data in the following columns(in_reply_to_status_id, in_reply_to_status_id_str, in_reply_to_user_id, in_reply_to_user_id_str, in_reply_to_screen_name, user, geo, coordinates, place, contributors, quoted_status, quoted_status_id_str, quoted_status_id, retweeted_status, possibly_sensitive_appealable, possibly_sensitive)
tweet_json
- quoted_status_id, quoted_status_id_str columns contain the same values - duplicated
tweet_json
- id, id_str columns contain same values in 1518 instances - duplicated
twitter-archive-enhanced
- doggo, floofer, pupper, puppo are values of the 'dog stage' variable and are being used as columns
twitter-archive-enhanced and tweet_json
- duplicated columns in the tables(source, in_reply_to_status_id, in_reply_to_user_id). Other columns are duplicated but have different identifiers(created_at and timestamp, full_text and text)
# Make copies of original pieces of data
# Cleaning operations below mutate these copies, never the raw frames.
df_twitter_archive_copy = df_twitter_archive.copy()
df_tweet_json_copy = df_tweet_json.copy()
df_img_copy = df_img.copy()
twitter-archive-enhanced
missing data in the following columns(expanded_urls, retweeted_status_timestamp, retweeted_status_user_id, retweeted_status_id, in_reply_to_user_id, in_reply_to_status_id)
df_twitter_archive[['expanded_urls', 'retweeted_status_timestamp', 'retweeted_status_user_id', 'retweeted_status_id', 'in_reply_to_user_id', 'in_reply_to_status_id']].info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 expanded_urls 2297 non-null object 1 retweeted_status_timestamp 181 non-null object 2 retweeted_status_user_id 181 non-null float64 3 retweeted_status_id 181 non-null float64 4 in_reply_to_user_id 78 non-null float64 5 in_reply_to_status_id 78 non-null float64 dtypes: float64(4), object(2) memory usage: 110.6+ KB
Use the dataframe drop method to delete columns with missing values in df_twitter_archive_copy
# Remove the mostly-empty reply/retweet metadata columns from the archive copy.
sparse_columns = [
    'expanded_urls',
    'retweeted_status_timestamp',
    'retweeted_status_user_id',
    'retweeted_status_id',
    'in_reply_to_user_id',
    'in_reply_to_status_id',
]
df_twitter_archive_copy.drop(columns=sparse_columns, inplace=True)
df_twitter_archive_copy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 timestamp 2356 non-null object 2 source 2356 non-null object 3 text 2356 non-null object 4 rating_numerator 2356 non-null int64 5 rating_denominator 2356 non-null int64 6 name 2356 non-null object 7 doggo 2356 non-null object 8 floofer 2356 non-null object 9 pupper 2356 non-null object 10 puppo 2356 non-null object dtypes: int64(3), object(8) memory usage: 202.6+ KB
twitter-archive-enhanced
- timestamp column has incorrect data type of object instead of datetime
change the data type of timestamp column to datetime
# Convert timestamp from plain strings to timezone-aware datetime64 values.
df_twitter_archive_copy['timestamp'] = pd.to_datetime(df_twitter_archive_copy['timestamp'])
df_twitter_archive_copy['timestamp'].info()
<class 'pandas.core.series.Series'> RangeIndex: 2356 entries, 0 to 2355 Series name: timestamp Non-Null Count Dtype -------------- ----- 2356 non-null datetime64[ns, UTC] dtypes: datetime64[ns, UTC](1) memory usage: 18.5 KB
twitter-archive-enhanced
- 23 instances where incorrect 'rating_denominator' exist
# Print the full text of every tweet whose rating_denominator != 10 so the
# true rating can be read off manually.
# NOTE(review): `arr1` was undefined in this export (the earlier array is named
# `arr` and holds name-check positions, not these rows); it is rebuilt here from
# the same denominator filter used in the assessment — the printed indices
# (313, 342, 433, ...) confirm this is the intended set.
arr1 = df_twitter_archive_copy[df_twitter_archive_copy['rating_denominator'] != 10].index
for i in arr1:
    # iloc matches these labels only while the copy still has its default RangeIndex
    print(str(i) + ' ' + df_twitter_archive_copy.iloc[i].text + '\n')
313 @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho 342 @docmisterio account started on 11/15/15 433 The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd 516 Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. Keep Sam smiling by clicking and sharing this link: https://t.co/98tB8y7y7t https://t.co/LouL5vdvxx 784 RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/… 902 Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE 1068 After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ 1120 Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv 1165 Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a 1202 This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq 1228 Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 1254 Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 1274 From left to right: Cletus, Jerome, Alejandro, Burp, & Titson None know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK 1351 Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa 1433 Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ 1598 Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating 1634 Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 1635 Someone help the girl is being mugged. 
Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 1662 This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 1663 I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible 1779 IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq 1843 Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw 2335 This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv
# Manually repair ratings that were mis-parsed from the tweet text (the true
# rating appears after an earlier "x/y" fragment such as a date or "7/11"),
# then drop the rows that have no valid single-dog rating at all.
rating_fixes = {
    # index: (numerator, denominator), taken from re-reading the tweet text
    313: (13, 10),   # "...960/00 isn't a valid rating, 13/10 is tho"
    784: (14, 10),   # "...last surviving 9/11 search dog, and our second ever 14/10"
    1165: (13, 10),  # "Happy 4/20 from the squad! 13/10 for all"
    1202: (11, 10),  # "...match ups are split 50/50. Amazed af. 11/10"
    1662: (10, 10),  # "...robbed a 7/11 ... spotted by the helicopter 10/10"
    2335: (9, 10),   # "...Albanian 3 1/2 legged Episcopalian ... 9/10"
}
for row_label, (numerator, denominator) in rating_fixes.items():
    df_twitter_archive_copy.at[row_label, 'rating_numerator'] = numerator
    df_twitter_archive_copy.at[row_label, 'rating_denominator'] = denominator
# Rows whose text contains no usable single-dog rating are removed outright.
df_twitter_archive_copy.drop([1843, 1663, 1635, 1634, 1598, 1433, 1351, 1274, 1254, 1228, 1120, 902, 516, 433, 342, 1068, 1779], axis='rows', inplace=True)
# Verify that no non-10 denominators remain.
df_twitter_archive_copy[df_twitter_archive_copy['rating_denominator'] != 10]
Unnamed: 0.1 | Unnamed: 0 | tweet_id | timestamp | source | text | rating_numerator | rating_denominator | Dog Stage | truncated | ... | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog |
---|
0 rows × 30 columns
df_twitter_archive_copy.shape
(2339, 7)
twitter-archive-enhanced
- 9 records had inappropriate rating_numerator variables i.e greater than 20 after correcting the rating_denominator issue.
Delete the 9 records with inappropriate rating_numerator variables
# Drop the 9 records whose rating_numerator is still > 20 after the denominator
# fixes. NOTE(review): these index labels were identified in a cell missing from
# this export — re-derive them via
# df_twitter_archive_copy[df_twitter_archive_copy['rating_numerator'] > 20].index
# before rerunning.
df_twitter_archive_copy.drop([1695, 759, 973, 188, 187, 2055, 289, 339, 691], axis='rows', inplace=True)
df_twitter_archive_copy.rating_numerator.max()
17
twitter-archive-enhanced
- some records e.g index - 200, 191 have more than one dog stage
Retrieve the text variables of records with more than one dog stage and determine the correct dog stage
df_twitter_archive_copy[((df_twitter_archive_copy['doggo']=='doggo') & (df_twitter_archive_copy['Dog Stage'] != 'doggo'))].index
Int64Index([], dtype='int64')
df_twitter_archive_copy[((df_twitter_archive_copy['floofer']=='floofer') & (df_twitter_archive_copy['Dog Stage'] != 'floofer'))].index
Int64Index([200], dtype='int64')
df_twitter_archive_copy['text'].iloc[200]
"At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk"
df_twitter_archive_copy.at[200, 'Dog Stage'] = 'floofer'
df_twitter_archive_copy['Dog Stage'].iloc[200]
'floofer'
df_twitter_archive_copy[((df_twitter_archive_copy['puppo']=='puppo') & (df_twitter_archive_copy['Dog Stage'] != 'puppo'))].index
Int64Index([191], dtype='int64')
df_twitter_archive_copy['text'].iloc[191]
"Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.co/cMhq16isel"
df_twitter_archive_copy.at[191, 'Dog Stage'] = 'puppo'
df_twitter_archive_copy['Dog Stage'].iloc[191]
'puppo'
df_twitter_archive_copy[((df_twitter_archive_copy['pupper']=='pupper') & (df_twitter_archive_copy['Dog Stage'] != 'pupper'))].index
Int64Index([460, 531, 565, 575, 705, 733, 778, 822, 889, 956, 1063, 1113], dtype='int64')
df_twitter_archive['text'].iloc[460]
'This is Dido. She\'s playing the lead role in "Pupper Stops to Catch Snow Before Resuming Shadow Box with Dried Apple." 13/10 (IG: didodoggo) https://t.co/m7isZrOBX7'
df_twitter_archive_copy.at[460, 'Dog Stage'] = 'doggo'
df_twitter_archive['text'].iloc[531]
'Here we have Burke (pupper) and Dexter (doggo). Pupper wants to be exactly like doggo. Both 12/10 would pet at same time https://t.co/ANBpEYHaho'
df_twitter_archive['text'].iloc[565]
'Like doggo, like pupper version 2. Both 11/10 https://t.co/9IxWAXFqze'
df_twitter_archive['text'].iloc[575]
"This is Bones. He's being haunted by another doggo of roughly the same size. 12/10 deep breaths pupper everything's fine https://t.co/55Dqe0SJNj"
df_twitter_archive_copy.at[575, 'Dog Stage'] = 'pupper'
df_twitter_archive['text'].iloc[705]
"This is Pinot. He's a sophisticated doggo. You can tell by the hat. Also pointier than your average pupper. Still 10/10 would pet cautiously https://t.co/f2wmLZTPHd"
df_twitter_archive_copy.at[705, 'Dog Stage'] = 'doggo'
df_twitter_archive['text'].iloc[733]
'Pupper butt 1, Doggo 0. Both 12/10 https://t.co/WQvcPEpH2u'
df_twitter_archive['text'].iloc[778]
'RT @dog_rates: Like father (doggo), like son (pupper). Both 12/10 https://t.co/pG2inLaOda'
df_twitter_archive['text'].iloc[822]
'RT @dog_rates: This is just downright precious af. 12/10 for both pupper and doggo https://t.co/o5J479bZUC'
df_twitter_archive['text'].iloc[889]
'Meet Maggie & Lila. Maggie is the doggo, Lila is the pupper. They are sisters. Both 12/10 would pet at the same time https://t.co/MYwR4DQKll'
df_twitter_archive['text'].iloc[956]
"Please stop sending it pictures that don't even have a doggo or pupper in them. Churlish af. 5/10 neat couch tho https://t.co/u2c9c7qSg8"
df_twitter_archive['text'].iloc[1063]
'This is just downright precious af. 12/10 for both pupper and doggo https://t.co/o5J479bZUC'
df_twitter_archive['text'].iloc[1113]
'Like father (doggo), like son (pupper). Both 12/10 https://t.co/pG2inLaOda'
df_twitter_archive_copy['Dog Stage'].iloc[[460, 531, 565, 575, 705, 733, 778, 822, 889, 956, 1063, 1113]]
460 doggo 531 doggo 565 doggo 575 pupper 705 doggo 733 doggo 778 doggo 822 doggo 889 doggo 956 doggo 1063 doggo 1113 doggo Name: Dog Stage, dtype: object
tweet_json
- quoted_status_id, quoted_status_id_str columns contain the same duplicated values
Delete quoted_status_id_str column
# Remove the redundant string-typed quoted-status id (duplicates quoted_status_id).
df_tweet_json_copy.drop(columns=['quoted_status_id_str'], inplace=True)
# Show the remaining columns to confirm the drop.
df_tweet_json_copy.columns.tolist()
['id', 'truncated', 'display_text_range', 'entities', 'extended_entities', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang']
tweet_json
- missing data in the following columns(in_reply_to_status_id, in_reply_to_status_id_str, in_reply_to_user_id, in_reply_to_user_id_str, in_reply_to_screen_name, user, geo, coordinates, place, contributors, quoted_status, quoted_status_id, retweeted_status, possibly_sensitive_appealable, possibly_sensitive)
Delete columns with missing values in tweet_json data
# Drop the sparsely-populated columns listed in the issue above.
# FIX(review): 'quoted_status_id_str' is removed from this list — it was already
# dropped in the previous cleaning step, so including it again raises a KeyError.
df_tweet_json_copy.drop(['in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'quoted_status', 'quoted_status_id', 'retweeted_status', 'possibly_sensitive_appealable', 'possibly_sensitive'], axis='columns', inplace=True)
list(df_tweet_json_copy)
['created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'extended_entities', 'source', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang']
tweet_json
- id, id_str columns contain same values in 1518 instances - duplicated
Delete id_str column in the tweet_json data
# id and id_str duplicate the tweet id; keep only the reliable integer 'id'.
df_tweet_json_copy.drop(columns=['id_str'], inplace=True)
df_tweet_json_copy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2354 entries, 0 to 2353 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2354 non-null int64 1 truncated 2354 non-null bool 2 display_text_range 2354 non-null object 3 entities 2354 non-null object 4 extended_entities 2073 non-null object 5 is_quote_status 2354 non-null bool 6 retweet_count 2354 non-null int64 7 favorite_count 2354 non-null int64 8 favorited 2354 non-null bool 9 retweeted 2354 non-null bool 10 lang 2354 non-null object dtypes: bool(4), int64(3), object(4) memory usage: 138.1+ KB
twitter-archive-enhanced
- doggo, floofer, pupper, puppo are values of the 'dog stage' variable and are being used as columns
Create a column 'Dog Stage' and populate it using values in doggo, floofer, pupper, puppo columns then delete doggo, floofer, pupper, puppo columns
# construct a single column 'Dog Stage' from twitter-archive-enhanced - doggo, floofer, pupper, puppo
#columns
df_twitter_archive_copy['Dog Stage'] = np.nan
# Assign stages in reverse priority order so the LAST assignment wins:
# doggo > floofer > pupper > puppo (same precedence as an if/elif chain).
# Boolean-mask .loc avoids the original loop's positional-index (.iloc/range)
# vs label (.at) mismatch, which could silently enlarge the frame with new
# row labels once rows have been dropped.
for stage in ('puppo', 'pupper', 'floofer', 'doggo'):
    df_twitter_archive_copy.loc[df_twitter_archive_copy[stage] == stage, 'Dog Stage'] = stage
# The four one-hot stage columns are now redundant.
df_twitter_archive_copy.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis='columns', inplace=True)
df_twitter_archive_copy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 timestamp 2356 non-null datetime64[ns, UTC] 2 source 2356 non-null object 3 text 2356 non-null object 4 rating_numerator 2356 non-null int64 5 rating_denominator 2356 non-null int64 6 Dog Stage 380 non-null object dtypes: datetime64[ns, UTC](1), int64(3), object(3) memory usage: 129.0+ KB
twitter-archive-enhanced and tweet_json
- duplicated columns in the tables(source, in_reply_to_status_id, in_reply_to_user_id). Other columns are duplicated but have different identifiers(created_at and timestamp, full_text and text)
Delete the duplicated columns in tweet_json data (source, in_reply_to_status_id, created_at, in_reply_to_user_id, full_text).
# Drop tweet_json columns that duplicate archive columns (source,
# created_at/timestamp, full_text/text) ahead of the merge, so the joined
# table carries each fact exactly once.
for duplicate_column in ('source', 'created_at', 'full_text'):
    df_tweet_json_copy.drop(columns=[duplicate_column], inplace=True)
df_tweet_json_copy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2354 entries, 0 to 2353 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2354 non-null int64 1 id_str 2354 non-null int64 2 truncated 2354 non-null bool 3 display_text_range 2354 non-null object 4 entities 2354 non-null object 5 extended_entities 2073 non-null object 6 is_quote_status 2354 non-null bool 7 retweet_count 2354 non-null int64 8 favorite_count 2354 non-null int64 9 favorited 2354 non-null bool 10 retweeted 2354 non-null bool 11 lang 2354 non-null object dtypes: bool(4), int64(4), object(4) memory usage: 156.4+ KB
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".
# merge df_twitter_archive_copy with df_tweet_json_copy on the tweet id
df_merge1 = pd.merge(df_twitter_archive_copy, df_tweet_json_copy, left_on='tweet_id', right_on='id', how='inner')
# merge df_merge1 with df_img_copy; a left join keeps tweets without image predictions
df_merge2 = pd.merge(df_merge1, df_img_copy, on='tweet_id', how='left')
# Delete id column which is a duplicate of tweet_id
df_merge2.drop(['id'], axis='columns', inplace=True)
# Store df_merge2 in 'twitter_archive_master.csv'.
# FIX(review): index=False prevents the RangeIndex being written out and
# re-read as spurious 'Unnamed: 0' / 'Unnamed: 0.1' columns on every round-trip.
df_merge2.to_csv('twitter_archive_master.csv', index=False)
df_merge2 = pd.read_csv('twitter_archive_master.csv')
In this section, analyze and visualize your wrangled data. You must produce at least three (3) insights and one (1) visualization.
df_merge2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2328 entries, 0 to 2327 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2328 non-null int64 1 timestamp 2328 non-null object 2 source 2328 non-null object 3 text 2328 non-null object 4 rating_numerator 2328 non-null int64 5 rating_denominator 2328 non-null int64 6 Dog Stage 378 non-null object 7 truncated 2328 non-null bool 8 display_text_range 2328 non-null object 9 entities 2328 non-null object 10 extended_entities 2054 non-null object 11 is_quote_status 2328 non-null bool 12 retweet_count 2328 non-null int64 13 favorite_count 2328 non-null int64 14 favorited 2328 non-null bool 15 retweeted 2328 non-null bool 16 lang 2328 non-null object 17 jpg_url 2054 non-null object 18 img_num 2054 non-null float64 19 p1 2054 non-null object 20 p1_conf 2054 non-null float64 21 p1_dog 2054 non-null object 22 p2 2054 non-null object 23 p2_conf 2054 non-null float64 24 p2_dog 2054 non-null object 25 p3 2054 non-null object 26 p3_conf 2054 non-null float64 27 p3_dog 2054 non-null object dtypes: bool(4), float64(4), int64(5), object(15) memory usage: 445.7+ KB
#compute the statistical summary of rating_numerator, retweet_count, favorite_count variables of interest
# (counts confirm 2328 merged rows; max rating is 17 after cleaning)
df_merge2[['rating_numerator', 'retweet_count', 'favorite_count']].describe()
rating_numerator | retweet_count | favorite_count | |
---|---|---|---|
count | 2328.000000 | 2328.000000 | 2328.000000 |
mean | 10.707904 | 3177.306701 | 8115.581615 |
std | 2.179659 | 5307.060695 | 11862.930533 |
min | 0.000000 | 0.000000 | 0.000000 |
25% | 10.000000 | 625.500000 | 1413.750000 |
50% | 11.000000 | 1477.000000 | 3607.000000 |
75% | 12.000000 | 3652.000000 | 10185.250000 |
max | 17.000000 | 79515.000000 | 132810.000000 |
# Histogram showing how rating_numerator is distributed (unit-width bins)
max_rating = df_merge2['rating_numerator'].max()
rating_bins = np.arange(0, max_rating + 1, 1)
sns.displot(df_merge2['rating_numerator'], bins=rating_bins, kde=False)
plt.xticks(np.arange(0, max_rating + 1, 1))
plt.title('Histogram depicting the distribution of rating_numerator variable')
plt.savefig('images/hist_rating.png', dpi=75, bbox_inches='tight');
# Frequency of each rating value (per the output, most ratings fall in 10-13)
df_merge2['rating_numerator'].value_counts()
12 557 11 464 10 462 13 352 9 157 8 102 14 55 7 54 5 37 6 32 3 19 4 15 2 9 1 8 0 2 15 2 17 1 Name: rating_numerator, dtype: int64
# Histogram showing how retweet_count is distributed (bin width 2500)
max_retweets = df_merge2['retweet_count'].max()
retweet_bins = np.arange(0, max_retweets + 5, 2500)
sns.displot(df_merge2['retweet_count'], bins=retweet_bins, kde=False)
plt.xticks(np.arange(0, max_retweets + 5, 5000), rotation=90)
plt.title('Histogram depicting the distribution of retweet_count variable')
plt.savefig('images/hist_retweet.png', dpi=75, bbox_inches='tight');
# Histogram showing how favorite_count is distributed (bin width 5000)
max_favorites = df_merge2['favorite_count'].max()
favorite_bins = np.arange(0, max_favorites + 5, 5000)
sns.displot(df_merge2['favorite_count'], bins=favorite_bins, kde=False)
plt.xticks(np.arange(0, max_favorites + 5, 10000), rotation=90)
plt.title('Histogram depicting the distribution of favorite_count variable')
plt.savefig('images/hist_favorite.png', dpi=75, bbox_inches='tight');
# Plot matrix: scatter plots off the diagonal, histograms on the diagonal,
# for a broad overview of pair-wise relationships between the variables.
# Note: df_stats is reused by the correlation heatmap cell below.
df_stats = ['rating_numerator', 'retweet_count', 'favorite_count']
pair_grid = sns.PairGrid(data=df_merge2, vars=df_stats)
pair_grid.map_offdiag(plt.scatter)
pair_grid.map_diag(plt.hist);
Plot Matrix depicting the broad overview of pair-wise relationships between variables
# Heatmap of the pairwise correlations between retweet_count,
# rating_numerator and favorite_count (annotated to 2 decimal places)
corr_matrix = df_merge2[df_stats].corr()
sns.heatmap(corr_matrix, cmap='rocket_r', annot=True, fmt='.2f')
plt.title('Heatmap depicting the correlation between retweet_count, rating_numerator and favorite_count');
plt.savefig('images/heat_map.png', dpi=75, bbox_inches='tight');
# Scatter plot visualising the relationship between retweets and favorites
plt.figure(figsize=(5, 5))
plt.scatter(df_merge2['retweet_count'], df_merge2['favorite_count'])
plt.xlabel('retweet_count')
plt.ylabel('favorite_count')
plt.title('Scatter Plot of retweet_count against favorite_count')
plt.savefig('images/scatter.png', dpi=75, bbox_inches='tight');
# Breed frequencies among tweets whose top (p1) prediction is a dog breed
df_merge2.loc[df_merge2['p1_dog'] == True, 'p1'].value_counts()
golden_retriever 146 Labrador_retriever 97 Pembroke 88 Chihuahua 83 pug 57 ... Scotch_terrier 1 EntleBucher 1 Japanese_spaniel 1 standard_schnauzer 1 silky_terrier 1 Name: p1, Length: 110, dtype: int64
# Breed frequencies among tweets whose second (p2) prediction is a dog breed
df_merge2.loc[df_merge2['p2_dog'] == True, 'p2'].value_counts()
Labrador_retriever 104 golden_retriever 90 Cardigan 73 Chihuahua 43 Pomeranian 42 ... Kerry_blue_terrier 1 Bernese_mountain_dog 1 EntleBucher 1 standard_schnauzer 1 komondor 1 Name: p2, Length: 113, dtype: int64
# Breed frequencies among tweets whose third (p3) prediction is a dog breed
df_merge2.loc[df_merge2['p3_dog'] == True, 'p3'].value_counts()
Labrador_retriever 77 Chihuahua 58 golden_retriever 47 Eskimo_dog 38 kelpie 35 .. Sealyham_terrier 2 affenpinscher 1 Sussex_spaniel 1 Kerry_blue_terrier 1 standard_schnauzer 1 Name: p3, Length: 116, dtype: int64
# Fraction of p1 dog predictions made with confidence of at least 0.5
p1_dog_conf = df_merge2.loc[df_merge2['p1_dog'] == True, 'p1_conf']
(p1_dog_conf >= 0.5).mean()
0.6352242744063324
# Number of p2 dog predictions with confidence of at least 0.5
df_merge2.loc[(df_merge2['p2_dog'] == True) & (df_merge2['p2_conf'] >= 0.5), 'p2_conf'].size
0
# Number of p3 dog predictions with confidence of at least 0.5
df_merge2.loc[(df_merge2['p3_dog'] == True) & (df_merge2['p3_conf'] >= 0.5), 'p3_conf'].size
0
# Distinct dog breeds among p1 predictions, relative to the total row count
df_merge2.loc[df_merge2['p1_dog'] == True, 'p1'].nunique() / len(df_merge2)
0.04725085910652921
# Distinct dog breeds among p2 predictions, relative to the total row count
df_merge2.loc[df_merge2['p2_dog'] == True, 'p2'].nunique() / len(df_merge2)
0.048539518900343644
# Distinct dog breeds among p3 predictions, relative to the total row count
df_merge2.loc[df_merge2['p3_dog'] == True, 'p3'].nunique() / len(df_merge2)
0.04982817869415808
There is a strong correlation between the retweet_count and favorite_count variables, with a correlation coefficient of 0.7. A scatter plot of retweet_count against favorite_count shows most data points following an approximately linear relationship between the two variables.
There is a weak correlation between rating_numerator and retweet_count (correlation coefficient 0.29) and between rating_numerator and favorite_count (correlation coefficient 0.34). Scatter plots of rating_numerator against favorite_count and against retweet_count show data points with no clear linear relationship between the variables.