Commit c923e821 authored by Sina's avatar Sina
Browse files

merging vis with main

parent 031fc01c
%% Cell type:code id:790abd3e tags:
 
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
```
 
%% Cell type:code id:1c15fe13 tags:
 
``` python
# Load the train and test splits, using the `id` column as the index,
# then preview five random training rows.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./train.csv', './test.csv')
)
df_train.sample(5)
```
 
%% Output
 
keyword location \
id
4020 disaster Hinton, W.Va.
6163 hijack NaN
2691 crush NaN
4021 disaster NaN
4457 electrocuted Edinburgh
text target
id
4020 Jeff Locke. Train wreck. F'in disaster. Fortun... 1
6163 0-day bug in fully patched OS X comes under ac... 0
2691 Only had a crush on one girl in high school an... 0
4021 #Metepec #Mexico - ?NIGHT DISASTER?...E(Oficia... 1
4457 @That_fat_guy there's literally a video of an ... 0
 
%% Cell type:code id:1c257c3c tags:
 
``` python
# Class balance of the training labels: a pie chart of the shares (left)
# and a bar chart of the absolute counts (right).
#
# Sizes are taken from the `target` column itself. Counting another column
# per group (e.g. `location`) would silently skip rows where that column is
# NaN and distort the shares.
target_counts = df_train['target'].value_counts().sort_index()
target_share = target_counts / target_counts.sum()
class_names = {0: 'Not Disaster', 1: 'Disaster'}

fig, axes = plt.subplots(ncols=2, figsize=(17, 4), dpi=100)

# Labels are derived from the data so they cannot drift from what is drawn.
pie_labels = [
    f'{class_names.get(cls, str(cls))} ({target_share[cls]:.0%})'
    for cls in target_counts.index
]
bar_labels = [
    f'{class_names.get(cls, str(cls))} ({target_counts[cls]})'
    for cls in target_counts.index
]

target_counts.plot(kind='pie', ax=axes[0], labels=pie_labels)
sns.countplot(x=df_train['target'], hue=df_train['target'], ax=axes[1])

for ax in axes:
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=15)
    ax.tick_params(axis='y', labelsize=15)

# Pin the tick positions before relabelling them.
axes[1].set_xticks(range(len(bar_labels)))
axes[1].set_xticklabels(bar_labels)

axes[0].set_title('Target Distribution in Training Set', fontsize=13)
axes[1].set_title('Target Count in Training Set', fontsize=13)

# Lay out the figure only after everything has been drawn.
plt.tight_layout()
plt.show()
```
 
%% Output
 
 
%% Cell type:code id:da95b9ad tags:
 
``` python
# Number of training rows whose label is missing.
print('# Null values in labels:', df_train['target'].isnull().sum())
```
 
%% Output
 
# Null values in labels: 0
 
%% Cell type:code id:240a2782 tags:
 
``` python
# Share (not count) of training rows with a missing keyword; the label says
# so explicitly because the value printed is a mean over the null mask.
print('Fraction of null values in keyword:', df_train['keyword'].isnull().mean())
```
 
%% Output
 
# Null values in keyword: 0.008012610009194798
 
%% Cell type:code id:d2f9d060 tags:
 
``` python
# Report (rows, columns) for both splits.
for split_name, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_name} shape:", split_frame.shape)
```
 
%% Output
 
Train shape: (7613, 4)
Test shape: (3263, 3)
 
%% Cell type:code id:ef7ca4f0 tags:
 
``` python
 
# Print, for every column of each split, the fraction of missing values.
# The denominator is the length of the frame being inspected, so the
# test-set ratios are relative to the test set, not the training set.
for df in [df_train, df_test]:
    for col in df.columns:
        print(col, np.round(df[col].isna().sum() / len(df), 4), 'na value')
    print()
```
 
%% Output
 
keyword 0.008 na value
location 0.3327 na value
text 0.0 na value
target 0.0 na value
keyword 0.0034 na value
location 0.1451 na value
text 0.0 na value
 
%% Cell type:code id:858a9ec3 tags:
 
``` python
# Replace missing `keyword` / `location` values with the placeholder
# "unknown" in both splits.
#
# The result is assigned back to the frame rather than calling
# `fillna(..., inplace=True)` on a column accessor: the chained in-place
# form is deprecated and does not modify the frame under pandas
# Copy-on-Write.
for frame in (df_train, df_test):
    frame[['keyword', 'location']] = frame[['keyword', 'location']].fillna("unknown")
```
 
%% Cell type:code id:6d26ea90 tags:
 
``` python
# Horizontal bar chart of the most frequent `location` values in the
# training set.
# NOTE(review): despite its name, `top20_loc` holds the first 10 entries.
location_counts = df_train.groupby(['location']).location.count()
top20_loc = location_counts.sort_values(ascending=False)[:10]

plt.figure(figsize=(8, 6))
sns.barplot(x=top20_loc, y=top20_loc.index);
plt.xlabel('number of tweets');
```
 
%% Output
 
 
%% Cell type:code id:b38a2b3b tags:
 
``` python
# Count plot of tweets per keyword, split by target, with keywords ordered
# by their mean target value (highest first).
#
# The helper column lives on a temporary copy, so `df_train` is never
# modified (and cannot be left with a stale `target_mean` column if plotting
# fails), and the frame is sorted once instead of once per plotted series.
keyword_target_mean = df_train.groupby('keyword')['target'].transform('mean')
sorted_by_rate = (
    df_train
    .assign(target_mean=keyword_target_mean)
    .sort_values(by='target_mean', ascending=False)
)

fig = plt.figure(figsize=(20, 8), dpi=75)
sns.countplot(x=sorted_by_rate['keyword'], hue=sorted_by_rate['target'])
plt.tick_params(axis='x', labelsize=6)
plt.xticks(rotation=90, ha='right')
plt.tick_params(axis='y', labelsize=10)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')
plt.show()
```
 
%% Output
 
 
%% Cell type:code id:70115299 tags:
 
``` python
 
# Lower-case accented Latin letters mapped to their unaccented ASCII form.
_ACCENT_TABLE = str.maketrans({
    **dict.fromkeys("àáâãäå", "a"),
    **dict.fromkeys("èéêë", "e"),
    **dict.fromkeys("ìíîï", "i"),
    **dict.fromkeys("òóôõö", "o"),
    **dict.fromkeys("ùúûü", "u"),
    **dict.fromkeys("ýÿ", "y"),
    "ß": "ss",
    "ñ": "n",
})


def remove_accents(raw_text):
    """Replace lower-case accented letters with their plain ASCII equivalent.

    The accent is stripped but the letter is kept (e.g. "é" becomes "e",
    "ß" becomes "ss"). Deleting the whole character would truncate words
    ("café" would become "caf"). Characters outside the table are left
    untouched.

    Args:
        raw_text: The string to transliterate.

    Returns:
        A new string with the mapped characters replaced.
    """
    # A single translate pass; no dependency on the regex module.
    return raw_text.translate(_ACCENT_TABLE)
```
 
%% Cell type:code id:847d498d tags:
 
``` python
import unicodedata
def simplify(text):
    """Return `text` reduced to plain ASCII.

    The text is decomposed with Unicode NFD normalisation and every
    character that cannot be encoded as ASCII (including the combining
    marks produced by the decomposition) is dropped.
    """
    try:
        # Python 2 compatibility shim: decode byte strings to unicode.
        text = unicode(text, 'utf-8')
    except NameError:
        # `unicode` does not exist on Python 3; the text is used as given.
        pass

    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return str(ascii_bytes.decode("utf-8"))
```
 
%% Cell type:code id:9e357de1 tags:
 
``` python
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import regex as re
import string
 
# Make sure the NLTK resources used below are available locally: the
# stopword lists and the Punkt tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stopword list used by the cleaning step.
nltk_st = stopwords.words('english')
```
 
%% Output
 
2022-03-17 12:57:46.335808: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:
2022-03-17 12:57:46.335840: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to /home/sina/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sina/nltk_data...
[nltk_data] Package punkt is already up-to-date!
 
%% Cell type:code id:83fa9415 tags:
 
``` python
 
def clean(tweet, http = True, punc = True, lem = True, stop_w = True):
    """Normalise a raw tweet into a lower-cased, single-spaced token string.

    Steps, in order: optional t.co link removal, tokenising and
    lower-casing, optional stopword removal, dropping tokens of three
    characters or fewer, optional lemmatisation, optional punctuation
    removal, accent/ASCII folding, and whitespace collapsing.

    Args:
        tweet: The raw tweet text.
        http: If truthy, strip `http(s)://t.co/...` links.
        punc: If truthy, remove every character in `string.punctuation`.
        lem: If truthy, replace each token with its lemma via `sp`.
        stop_w: If truthy, drop tokens found in `nltk_st`. Any truthy value
            (e.g. the string 'nltk') enables the filter.

    Returns:
        The cleaned tweet as a string.
    """
    if http:
        # Raw string with the dot escaped, so only the literal host "t.co"
        # matches.
        tweet = re.sub(r"https?://t\.co/[A-Za-z0-9]*", '', tweet)

    tokens = [word.lower() for word in word_tokenize(tweet)]
    if stop_w:
        # Set membership instead of scanning the stopword list per token.
        stopword_set = set(nltk_st)
        tokens = [word for word in tokens if word not in stopword_set]
    tweet = ' '.join(tokens)

    # The joined text is tokenised again before the length filter, keeping
    # the original two-pass token boundaries.
    tweet = ' '.join(word for word in word_tokenize(tweet) if len(word) > 3)

    # Lemmatising
    if lem:
        # NOTE(review): `sp` is not defined in the cells visible here; it is
        # presumably a loaded spaCy pipeline (e.g. from spacy.load(...)).
        # Confirm it exists before calling with lem=True.
        tweet = ' '.join(word.lemma_ for word in sp(tweet))

    # Punctuation removal
    if punc:
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    tweet = remove_accents(tweet)
    tweet = simplify(tweet)

    # Collapse runs of whitespace into a single space
    tweet = re.sub(r"\s+", ' ', tweet)

    return tweet
```
 
%% Cell type:code id:acd8d3ae tags:
 
``` python
# Clean every training tweet (links and punctuation removed, no
# lemmatisation) and store the result in a new column.
df_train['cleaned_text'] = df_train['text'].apply(
    clean, lem=False, stop_w='nltk', http=True, punc=True
)
```
 
%% Cell type:code id:b5a41220 tags:
 
``` python
 
 
from wordcloud import WordCloud
 
# Word cloud built from the cleaned text of tweets with target == 1.
disaster_corpus = " ".join(df_train.loc[df_train.target == 1, 'cleaned_text'])

plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=500, width=800, height=400).generate(disaster_corpus)
plt.xlabel("real-disaster")
plt.imshow(wc, interpolation='bilinear')
 
```
 
%% Output
 
<matplotlib.image.AxesImage at 0x7fb8de326820>