In [None]:
"""Imports"""
import pandas as pd
"""
  read the dataset
"""

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks/Studia/pjn/project

# Load the training CSV, discard every row that has at least one missing
# value, and renumber the index so that row positions are contiguous again.
raw_data = (
    pd.read_csv("train.csv")
    .dropna(axis=0)
    .reset_index(drop=True)
)
raw_data.head(10)

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/Studia/pjn/project


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [None]:
import re
import nltk

from tqdm.notebook import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, TreebankWordTokenizer
from nltk.tokenize.casual import casual_tokenize
from nltk.util import ngrams
from typing import List, Tuple

nltk.download("stopwords")
nltk.download("wordnet")

class Tokenizer:
  """
  Sentence-cleaning pipeline: Treebank tokenization, contraction expansion,
  optional stop-word removal, case folding, lemmatization and n-gram building.

  All configuration lives in class attributes, so instances carry no state of
  their own.
  """
  # Characters treated as punctuation. The backslash is escaped explicitly
  # ("\\]") because a bare "\]" is an invalid escape sequence; the resulting
  # string value is unchanged. Note that the string also contains a space.
  punctuation = "!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~"
  # Code points of the ASCII letters A-Z and a-z.
  eng_ascii = list(range(65, 91)) + list(range(97, 123))
  # Code points 33..125; not referenced by any method of this class.
  usual_encodings = list(range(33, 126))
  # Contracted token -> expanded word. Keys are matched case-insensitively.
  # NOTE(review): "'s" is always expanded to "is", so possessives are affected too.
  contractions = {
    "'m": "am",
    "'s": "is",
    "'n": "and",
    "'re": "are",
    "n't": "not",
    "'d": "would",
    "'ll": "will",
    "'ve": "have",
    "'em": "them",
    "e'en": "even",
    "e'er": "ever",
    "cuz": "because",
    "cap'n": "captain",
    "a'ight": "alright",
    "'cause": "because",
    "'gainst": "against",
  }

  stopwords = nltk.corpus.stopwords.words("english")
  tokenizer = TreebankWordTokenizer()
  lemmatizer = WordNetLemmatizer()

  def _preprocess_sentences(self, sentences: List[str]) -> List[List[str]]:
    """
    Data processing entrypoint.

    Runs every sentence through tokenization, contraction expansion, stop-word
    removal (currently switched off), case folding, lemmatization and n-gram
    creation.

    The list is modified IN PLACE (each string is replaced by its list of
    n-grams) and the same list object is returned; pass a copy to keep the raw
    sentences.

    Raises:
      Exception: whatever a pipeline step raised. The offending index and
        sentence are printed first. Previously the error was swallowed and a
        half-processed list (strings mixed with token lists) was returned.
    """
    # total= lets tqdm show a real progress bar instead of a bare counter
    for idx, sentence in tqdm(enumerate(sentences), total=len(sentences)):
      try:
        # cleaning
        tokens = self._tokenize_treebank(sentence)
        tokens = self._resolve_contractions(tokens)
        tokens = self._remove_stopwords(tokens, False) # True | False
        # normalization
        tokens = self._case_folding(tokens) # all | first_only
        tokens = self._lemmatize(tokens) # True | False

        sentences[idx] = self._create_ngrams(tokens)
      except Exception:
        print(f"Tokenizer - failed on sentence {idx}: {sentence!r}")
        raise

    return sentences

  def _get_empty_ids(self, sentences: List[List[str]], threshold: int = 1) -> List[int]:
    """
    Get indexes of sentences whose length is < threshold.
    """
    print(f"Tokenizer - looking for sentences of len < {threshold}")
    empty_ids = [
      idx
      for idx, el in tqdm(enumerate(sentences), total=len(sentences))
      if len(el) < threshold
    ]
    print(f"Tokenizer - there are {len(empty_ids)} sentences of len < {threshold}")
    return empty_ids

  def _delete_empty_sentences(self, sentences: List[List[str]], empty_ids: List[int]):
    """
    Delete, in place, the elements of `sentences` whose indexes are listed in
    `empty_ids`.

    Indexes are de-duplicated first (a repeated index would otherwise delete a
    neighbouring element as well) and removed from the highest to the lowest so
    that earlier deletions do not shift the positions still to be removed.
    """
    print(f"Tokenizer - deleting sentences")
    for idx in tqdm(sorted(set(empty_ids), reverse=True)):
      del sentences[idx]

  def _tokenize_treebank(self, sentence: str, min_len: int = 2) -> List[str]:
    """
    1a, 1b
    Tokenizes and cleans one sentence; called by _preprocess_sentences.

    Args:
      sentence: raw sentence text.
      min_len: minimum number of characters a token needs to be kept
        (this parameter used to be ignored in favour of a hard-coded 2).

    Returns:
      The cleaned tokens. Tokens that are keys of `contractions` keep their
      apostrophe so that _resolve_contractions can still recognise them.
    """
    trailing_punctuation = tuple(self.punctuation)
    english_codes = set(self.eng_ascii)

    tokens = self.tokenizer.tokenize(sentence)
    # remove punctuation tokens (substring test against the punctuation string)
    tokens = [tok for tok in tokens if tok not in self.punctuation]
    # remove one trailing punctuation char from the tokens
    tokens = [tok[:-1] if tok.endswith(trailing_punctuation) else tok for tok in tokens]
    # remove tokens without any alphanumeric char
    # (str.isalnum() is also true for non-latin letters such as シ)
    tokens = [tok for tok in tokens if any(ch.isalnum() for ch in tok)]

    cleaned = []
    for tok in tokens:
      if tok.lower() in self.contractions:
        # keep contractions intact: stripping the apostrophe here made the
        # later contraction lookup a no-op ("'m" became "m" and was dropped)
        cleaned.append(tok)
      elif any(ord(ch) in english_codes for ch in tok):
        # token contains an ASCII letter: keep only its ASCII letters
        cleaned.append("".join(ch for ch in tok if ord(ch) in english_codes))
      else:
        cleaned.append(tok)

    # as last - remove tokens that are too short
    return [tok for tok in cleaned if len(tok) >= min_len]

  def _resolve_contractions(self, sentence: List[str]) -> List[str]:
    """
    1c
    Replace every token that is a key of `contractions` (compared
    case-insensitively) with its expansion; other tokens are kept as they are.
    """
    return [self.contractions.get(el.lower(), el) for el in sentence]

  def _remove_stopwords(self, sentence: List[str], run_job: bool = True) -> List[str]:
    """
    1d
    Delete stopwords from given tokenized sentence; no-op when run_job is False.
    """
    if run_job:
      # a set makes each membership test constant-time
      stopword_set = set(self.stopwords)
      sentence = [token for token in sentence if token.lower() not in stopword_set]
    return sentence

  def _case_folding(self, sentence: List[str], param: str = "all") -> List[str]:
    """
    1e
    some information is often communicated by capitalization of a word —
        for example, 'doctor' and 'Doctor' often have different meanings.
    hint: better approach for case normalization is to lowercase only the first
        word of a sentence and allow all other words to retain their capitalization

    Args:
      param: "all" lowercases every token (returns a new list);
        "first_only" lowercases only the first token, in place, unless it is
        fully upper-case; any other value leaves the sentence untouched.
    """
    if param == "first_only":
      # lowercase first token in sentence if token is not an abbreviation;
      # an empty sentence has no first token and is returned unchanged
      if sentence and not sentence[0].isupper():
        sentence[0] = sentence[0].lower()
    elif param == "all":
      # lowercase all tokens
      sentence = [token.lower() for token in sentence]

    return sentence

  def _lemmatize(self, sentence: List[str], run_job: bool = True) -> List[str]:
    """
    1f
    Lemmatize every token with the WordNet lemmatizer; no-op when run_job is False.
    """
    if run_job:
      sentence = [self.lemmatizer.lemmatize(token) for token in sentence]
    return sentence


  def _create_ngrams(self, sentence: List[str], ngrams_size=1) -> List[str]:
    """
    2a
    Build the n-grams of `sentence` and join each one into a single
    space-separated string (for ngrams_size=1 this returns the tokens themselves).
    """
    return [" ".join(gram) for gram in ngrams(sentence, ngrams_size)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Dataset preparation
- don't remove stopwords 
- remove tokenized question pairs in which at least one question has fewer than 4 tokens

In [None]:
import copy
from itertools import chain

# Pull both question columns and the duplicate label out of the cleaned frame
# as plain Python lists; position i in each list belongs to the same pair.
raw_question_1 = raw_data.question1.tolist()
raw_question_2 = raw_data.question2.tolist()
labels = raw_data.is_duplicate.tolist()

# Process data: tokenize the questions.
# _preprocess_sentences overwrites the list it receives, so shallow copies are
# passed in to keep the raw question lists intact.
tokenizer = Tokenizer()
tokenized_question_1 = tokenizer._preprocess_sentences(copy.copy(raw_question_1))
tokenized_question_2 = tokenizer._preprocess_sentences(copy.copy(raw_question_2))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

### ⭐Get ids of short sentences

In [None]:
"""Get ids of questions with len < threshold""" 
tokenizer = Tokenizer()

# Collect the positions of too-short sentences from both question lists.
empty_ids = []
threshold = 4  # minimum number of tokens a question must have to be kept
empty_ids.extend(tokenizer._get_empty_ids(tokenized_question_1, threshold=threshold))
empty_ids.extend(tokenizer._get_empty_ids(tokenized_question_2, threshold=threshold))
print(f"Amount of all ids: {len(empty_ids)}")
# A pair can be short on both sides, so the same position may appear twice;
# de-duplicate so that each pair is removed only once.
empty_ids = list(set(empty_ids))
print(len(empty_ids))

Tokenizer - looking for sentences of len < 4


0it [00:00, ?it/s]

Tokenizer - there are 4575 sentences of len < 4
Tokenizer - looking for sentences of len < 4


0it [00:00, ?it/s]

Tokenizer - there are 4749 sentences of len < 4
Amount of all ids: 9324
8087


In [None]:
"""Examine empty ids"""
from IPython.display import display, HTML
# Show the raw rows and the tokenized questions for the first three flagged
# positions. empty_ids was built from a set, so which three appear is arbitrary.
display(raw_data.iloc[sorted(empty_ids[:3])])
for idx in sorted(empty_ids[:3]):
  print("idx: ", idx)
  print(tokenized_question_1[idx])
  print(tokenized_question_2[idx])
  print("")

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
65545,65545,113738,113739,What is over urbanization?,What is urbanization?,0
229398,229400,285024,338684,What is substitution?,What is a good replacement for muscovado sugar?,0


idx:  6
['should', 'buy', 'tiago']
['what', 'keep', 'childern', 'active', 'and', 'far', 'from', 'phone', 'and', 'video', 'game']

idx:  65545
['what', 'is', 'over', 'urbanization']
['what', 'is', 'urbanization']

idx:  229398
['what', 'is', 'substitution']
['what', 'is', 'good', 'replacement', 'for', 'muscovado', 'sugar']



### ⭐ remove short sentences

In [None]:
"""tmp save last sentences of question1 and 2
    Note: they could be deleted if are too short, but in current case they must stay for len threshold = 4"""
# Remember the last tokenized question of each list so that the asserts below
# can verify the tail of the lists survived the deletion.
tmp1 = tokenized_question_1[-1]
tmp2 = tokenized_question_2[-1]

# Delete questions of empty ids
# The same positions are removed from both question lists and from the labels
# so that the three lists stay aligned.
tokenizer = Tokenizer()
tokenizer._delete_empty_sentences(tokenized_question_1, empty_ids)
tokenizer._delete_empty_sentences(tokenized_question_2, empty_ids)
tokenizer._delete_empty_sentences(labels, empty_ids)

"""Compare lenghts before and after deletion"""
print("original len: ", len(raw_question_1))
print("after removal should be: ", len(raw_question_1) - len(empty_ids))
print("q1 actual len: ", len(tokenized_question_1))
print("q2 actual len: ", len(tokenized_question_2))
print("labels actual len: ", len(labels))

"""Check if last sentences were preserved"""
assert tmp1 == tokenized_question_1[-1], "sentences are different"
assert tmp2 == tokenized_question_2[-1], "sentences are different"

"""check empty ids again, list must be empty"""
# Re-scan the filtered lists; note that this rebinds empty_ids, so re-running
# the cell afterwards deletes nothing further.
empty_ids = []
empty_ids.extend(tokenizer._get_empty_ids(tokenized_question_1, threshold=threshold))
empty_ids.extend(tokenizer._get_empty_ids(tokenized_question_2, threshold=threshold))
print(f"Amount of all ids: {len(empty_ids)}")
empty_ids = list(set(empty_ids))
print(len(empty_ids))
assert len(empty_ids) == 0, f"Empty ids list after removal is not empty, len: {len(empty_ids)}"

"""run last check"""
def last_check(sentences):
  """
  Final sanity pass over tokenized sentences.

  Prints a message for every element that is not exactly a `list`, and for
  every element shorter than the module-level `threshold`. Returns nothing.
  """
  for position, item in tqdm(enumerate(sentences)):
    item_type = type(item)
    if item_type is not list:
      print(f"Sentence {item} of id -- {position}-- is {item_type}")
    size = len(item)
    if size < threshold:
      print(f"Sentence {item} of id -- {position}-- len is {size}")

# Run the final sanity pass on both filtered question lists; no printed
# message means every element is a list with at least `threshold` tokens.
last_check(tokenized_question_1)
last_check(tokenized_question_2)

Tokenizer - deleting sentences


  0%|          | 0/8087 [00:00<?, ?it/s]

Tokenizer - deleting sentences


  0%|          | 0/8087 [00:00<?, ?it/s]

Tokenizer - deleting sentences


  0%|          | 0/8087 [00:00<?, ?it/s]

original len:  404287
after removal should be:  396200
q1 actual len:  396200
q2 actual len:  396200
labels actual len:  396200
Tokenizer - looking for sentences of len < 4


0it [00:00, ?it/s]

Tokenizer - there are 0 sentences of len < 4
Tokenizer - looking for sentences of len < 4


0it [00:00, ?it/s]

Tokenizer - there are 0 sentences of len < 4
Amount of all ids: 0
0


0it [00:00, ?it/s]

0it [00:00, ?it/s]

### ⭐ Save processed dataset

In [None]:
# save: bundle the filtered token lists and labels into one frame and write it
# to a parquet file in the current working directory
df_processed = pd.DataFrame({"question1": tokenized_question_1, "question2": tokenized_question_2, "is_duplicate": labels})
df_processed.to_parquet("processed_train.parquet")
# load: read the file straight back to check that the round trip works
df_processed = pd.read_parquet("processed_train.parquet")
df_processed.head(6)

Unnamed: 0,question1,question2,is_duplicate
0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",0
1,"[what, is, the, story, of, kohinoor, kohinoor,...","[what, would, happen, if, the, indian, governm...",0
2,"[how, can, increase, the, speed, of, my, inter...","[how, can, internet, speed, be, increased, by,...",0
3,"[why, am, mentally, very, lonely, how, can, so...","[find, the, remainder, when, math, 23, 24, mat...",0
4,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",0
5,"[astrology, am, capricorn, sun, cap, moon, and...","[triple, capricorn, sun, moon, and, ascendant,...",1


### ⭐lexicon creation ⭐
- lexicon size before adding the ASCII check: 104938
- lexicon size before switching the check from `usual_encodings` to `eng_ascii`: 102350

In [None]:
# Create lexicon: the sorted list of distinct tokens found in either question list
lexicon = sorted(
    set(chain.from_iterable(chain(tokenized_question_1, tokenized_question_2)))
)
lexicon_len = len(lexicon)
print(f"Lexicon len is {lexicon_len}")

Lexicon len is 91260


### ❌ Some problems still remain

In [None]:
# Peek at a 30-entry slice near the end of the sorted lexicon to spot tokens
# that the cleaning did not normalise.
lexicon[-730:-700]

['|778',
 '|8969',
 '|\\alpha',
 '|\\leq',
 '|\\sin',
 '|a',
 '|a*b|ab',
 '|a+b|=|a-b',
 '|a-b',
 '|a_0|+|a_1|',
 '|a_i',
 '|avector*bvector|=avector.bvector',
 '|bc|=|ca|=|ab',
 '|f',
 '|k-i/3+i|=5^1/2/5',
 '|r',
 '|x',
 '|x+2',
 '|x+3',
 '|x+y|+|w+z|=10',
 '|x-e',
 '|x-y',
 '|xy',
 '|x|',
 '|x|^2',
 '|x|^3',
 '|y|=2x+3',
 '|z',
 '|z-1|+|z-5',
 '|z-2|=re']

# TFIDF vectors

### ⭐Load dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks/Studia/pjn/project

import pandas as pd
# Reload the processed dataset from the parquet file in the current directory.
df_processed = pd.read_parquet("processed_train.parquet")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/Studia/pjn/project


In [None]:
# Unpack the reloaded frame into three parallel lists.
# NOTE(review): after a parquet round trip the per-row token containers may be
# numpy arrays rather than Python lists — confirm before relying on list-only behaviour.
tokenized_question_1 = df_processed.question1.tolist()
tokenized_question_2 = df_processed.question2.tolist()
labels = df_processed.is_duplicate.tolist()

# All three lengths must match for the lists to stay aligned.
print("q1 actual len: ", len(tokenized_question_1))
print("q2 actual len: ", len(tokenized_question_2))
print("labels actual len: ", len(labels))

q1 actual len:  396200
q2 actual len:  396200
labels actual len:  396200


### ⭐ lexicon creation

In [None]:
from itertools import chain

# Sorted list of the distinct tokens that occur in either question list.
lexicon = sorted(
    {
        token
        for sentence in (tokenized_question_1 + tokenized_question_2)
        for token in sentence
    }
)
lexicon_len = len(lexicon)
print(f"Lexicon len is {lexicon_len}")

Lexicon len is 91260


In [None]:
from collections import OrderedDict

# Template vector: one entry per lexicon token, in lexicon order, all set to 0.
# The vector-building functions below copy it and fill in the non-zero entries.
zero_vector = OrderedDict((token, 0) for token in lexicon)

In [None]:
from collections import Counter

# create TF vectors
def create_tf_vecs(sentences, tokenize=False):
  """
  Build one term-frequency vector per tokenized sentence.

  Each vector starts as a shallow copy of the module-level `zero_vector`; the
  entry of every token present in the sentence is then set to its count in
  that sentence divided by `len(lexicon)`.

  Args:
    sentences: iterable of token sequences.
    tokenize: accepted but not used by this function.

  Returns:
    A list with one vector per sentence, in input order.
  """
  def build_vector(tokens):
    vector = copy.copy(zero_vector)
    vector.update(
      (token, count / len(lexicon)) for token, count in Counter(tokens).items()
    )
    return vector

  return [build_vector(tokens) for _, tokens in tqdm(enumerate(sentences))]

# Build TF vectors for the first 10 questions of each list only: every vector
# is a full copy of zero_vector, i.e. it has one entry per lexicon token.
tf_question_1 = create_tf_vecs(tokenized_question_1[:10])
tf_question_2 = create_tf_vecs(tokenized_question_2[:10])
print(len(tf_question_1))
print(len(tf_question_2))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

10
10


In [None]:
def token_sentences(lexicon: List[str], all_sentences: List[List[str]]):
  """
  Count, for every lexicon token, how many sentences contain it.

  The sentences are scanned once and each sentence contributes at most 1 to
  the count of each of its distinct tokens. The previous implementation
  re-scanned every sentence for every lexicon token, which is
  O(len(lexicon) * len(all_sentences)) and impractical at this data size.

  Args:
    lexicon: tokens whose counts are written into the result.
    all_sentences: tokenized sentences (each a sequence of string tokens).

  Returns:
    A shallow copy of the module-level `zero_vector` in which the entry of
    every token of `lexicon` holds the number of sentences containing it.
  """
  from collections import Counter  # local import keeps this cell self-contained

  sentence_counts = Counter()
  for sent in tqdm(all_sentences):
    # set() so that a token repeated inside one sentence is counted once
    sentence_counts.update(set(sent))

  vec = copy.copy(zero_vector)
  for token in lexicon:
    # indexing a Counter with a missing key returns 0
    vec[token] = sentence_counts[token]
  return vec

# Number of questions (from both lists combined) that contain each lexicon token.
lexicon_token_to_num_sentences = token_sentences(lexicon, tokenized_question_1 + tokenized_question_2)

### ⭐implement cosine sim

In [None]:
import math
def cosine_sim(vec1, vec2):
  """
  Cosine similarity of two equal-length numeric vectors.

  Args:
    vec1, vec2: sequences of numbers with the same length.

  Returns:
    dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector has zero
    magnitude. The zero case used to divide by zero: ZeroDivisionError for
    plain Python numbers, a RuntimeWarning and NaN for numpy floats.

  Raises:
    ValueError: if the vectors differ in length.
  """
  if len(vec1) != len(vec2):
    raise ValueError(
      f"vectors must have the same length, got {len(vec1)} and {len(vec2)}"
    )

  dot_prod = sum(a * b for a, b in zip(vec1, vec2))

  mag_1 = math.sqrt(sum(x**2 for x in vec1))
  mag_2 = math.sqrt(sum(x**2 for x in vec2))

  denominator = mag_1 * mag_2
  if denominator == 0:
    # an all-zero vector has no direction; treat it as not similar to anything
    return 0.0
  return dot_prod / denominator

### ⭐ Use sklearn for TFIDF

In [None]:
"""transform tokenized questions into str sentences"""
# Join each token sequence back into one space-separated string, the input
# format expected by the vectorizer used in the following cells.
str_question_1 = [" ".join(sentence) for sentence in tokenized_question_1]
str_question_2 = [" ".join(sentence) for sentence in tokenized_question_2]
print(len(str_question_1))
print(len(str_question_2))
print(len(lexicon))

396200
396200
91260


In [None]:
"""train vectorizer"""
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary/weights on all questions from both lists.
# NOTE(review): stop_words="english" removes stop words here even though the
# dataset-preparation section says they are kept — confirm this is intended.
vectorizer = TfidfVectorizer(stop_words="english")
# `model` holds the return value of fit_transform for the training questions;
# the prediction functions below use `vectorizer`, not `model`.
model = vectorizer.fit_transform(str_question_1 + str_question_2)

In [None]:
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix

def get_general_metrics(y_true: list, y_pred: list):
    """
    Compute the binary confusion matrix for the given labels and predictions
    (class order: False, True) and return its four cells.

    Returns:
      (tn, fp, fn, tp) where
      'tn': true negatives
      'fp': false positives
      'fn': false negatives
      'tp': true positives
    """
    matrix = confusion_matrix(y_true, y_pred, labels=[False,True])
    tn, fp, fn, tp = matrix.ravel()
    return tn, fp, fn, tp

def predict_full_dataset(str_question_1, str_question_2, threshold):
  """
  Predict, pair by pair, whether two questions are duplicates.

  Both questions of a pair are turned into dense TF-IDF vectors with the
  module-level `vectorizer`; the pair is labelled 1 when `cosine_sim` of the
  two vectors is strictly greater than `threshold`, otherwise 0.

  Returns:
    A list of 0/1 predictions, one per pair, in input order.
  """
  print(f"Predict - threshold = {threshold}")

  def dense_tfidf(question):
    # transform a single question and unwrap the only row as a Python list
    return list(vectorizer.transform([question]).toarray()[0])

  preds = []
  for idx in tqdm(range(len(str_question_1))):
    similarity = cosine_sim(
      dense_tfidf(str_question_1[idx]),
      dense_tfidf(str_question_2[idx]),
    )
    preds.append(1 if similarity > threshold else 0)
  return preds

def evaluate_similarity(str_question_1, str_question_2, labels, threshold=0.5):
  """
  Predict duplicates for every question pair at `threshold` and compare the
  predictions with `labels`.

  Returns:
    The (tn, fp, fn, tp) tuple produced by get_general_metrics.
  """
  return get_general_metrics(
    y_true=labels,
    y_pred=predict_full_dataset(str_question_1, str_question_2, threshold),
  )

In [None]:
# Number of leading question pairs used by the evaluation cells below.
samples_no = 10000

In [None]:
# Evaluate the duplicate detector on the first `samples_no` pairs.
# Note: this rebinds the global `threshold`, which earlier cells used as the
# minimum token count.
threshold = 0.4
tn, fp, fn, tp = evaluate_similarity(
    str_question_1[:samples_no], 
    str_question_2[:samples_no], 
    labels[:samples_no], 
    threshold)

# Report the confusion-matrix cells: tn and fp are pairs labelled
# "not duplicate", fn and tp are pairs labelled "duplicate".
print(f"Threshold: {threshold}")
print(f"correctly predicted not duplicates: {tn}")
print(f"incorrectly predicted not duplicates: {fp}")
print(f"incorrectly predicted duplicates: {fn}")
print(f"correctly predicted duplicates: {tp}")

Predict - threshold = 0.4


  0%|          | 0/10000 [00:00<?, ?it/s]

  return dot_prod / (mag_1 * mag_2)


Threshold: 0.4
correctly predicted not duplicates: 2824
incorrectly predicted not duplicates: 3438
incorrectly predicted duplicates: 426
correctly predicted duplicates: 3312


In [None]:
# Same evaluation on the first `samples_no` pairs with a higher threshold.
threshold = 0.5
tn, fp, fn, tp = evaluate_similarity(
    str_question_1[:samples_no], 
    str_question_2[:samples_no], 
    labels[:samples_no], 
    threshold)

# tn and fp count pairs labelled "not duplicate"; fn and tp count pairs
# labelled "duplicate".
print(f"Threshold: {threshold}")
print(f"correctly predicted not duplicates: {tn}")
print(f"incorrectly predicted not duplicates: {fp}")
print(f"incorrectly predicted duplicates: {fn}")
print(f"correctly predicted duplicates: {tp}")

  0%|          | 0/10000 [00:00<?, ?it/s]

  return dot_prod / (mag_1 * mag_2)


Threshold: 0.5
correctly predicted not duplicates: 3535
incorrectly predicted not duplicates: 2727
incorrectly predicted duplicates: 800
correctly predicted duplicates: 2938


In [None]:
"""sklearn cosine similarity"""
from sklearn.metrics.pairwise import cosine_similarity

def get_general_metrics(y_true: list, y_pred: list):
    """
    Return the four cells of the binary confusion matrix (class order:
    False, True) for the given true labels and predictions.

    Returns:
      (tn, fp, fn, tp) where
      'tn': true negatives
      'fp': false positives
      'fn': false negatives
      'tp': true positives
    """
    cells = confusion_matrix(y_true, y_pred, labels=[False,True]).ravel()
    tn, fp, fn, tp = cells
    return (tn, fp, fn, tp)

def predict_full_dataset(str_question_1, str_question_2, threshold):
  """
  Predict, pair by pair, whether two questions are duplicates, using
  sklearn's `cosine_similarity` on dense TF-IDF vectors produced by the
  module-level `vectorizer`.

  A pair is labelled 1 when the similarity is strictly greater than
  `threshold`, otherwise 0.

  Returns:
    A list of 0/1 predictions, one per pair, in input order.
  """
  print(f"Predict - threshold = {threshold}")

  def as_dense_row(question):
    # transform a single question and unwrap the only row as a Python list
    return list(vectorizer.transform([question]).toarray()[0])

  def pair_similarity(idx):
    first = as_dense_row(str_question_1[idx])
    second = as_dense_row(str_question_2[idx])
    return cosine_similarity([first], [second])

  return [
    1 if pair_similarity(idx) > threshold else 0
    for idx in tqdm(range(len(str_question_1)))
  ]

def evaluate_similarity(str_question_1, str_question_2, labels, threshold=0.5):
  """
  Run predict_full_dataset at `threshold` and score the predictions against
  `labels`.

  Returns:
    The (tn, fp, fn, tp) tuple produced by get_general_metrics.
  """
  predictions = predict_full_dataset(str_question_1, str_question_2, threshold)
  return get_general_metrics(labels, predictions)

In [None]:
# Evaluate the first `samples_no` pairs with whichever evaluate_similarity /
# predict_full_dataset definitions were executed most recently.
threshold = 0.6
tn, fp, fn, tp = evaluate_similarity(
    str_question_1[:samples_no], 
    str_question_2[:samples_no], 
    labels[:samples_no], 
    threshold)

# tn and fp count pairs labelled "not duplicate"; fn and tp count pairs
# labelled "duplicate".
print(f"Threshold: {threshold}")
print(f"correctly predicted not duplicates: {tn}")
print(f"incorrectly predicted not duplicates: {fp}")
print(f"incorrectly predicted duplicates: {fn}")
print(f"correctly predicted duplicates: {tp}")

Predict - threshold = 0.6


  0%|          | 0/10000 [00:00<?, ?it/s]

Threshold: 0.6
correctly predicted not duplicates: 4203
incorrectly predicted not duplicates: 2059
incorrectly predicted duplicates: 1265
correctly predicted duplicates: 2473


In [None]:
# Same evaluation on the first `samples_no` pairs with the strictest threshold
# tried in this notebook.
threshold = 0.7
tn, fp, fn, tp = evaluate_similarity(
    str_question_1[:samples_no], 
    str_question_2[:samples_no], 
    labels[:samples_no], 
    threshold)

# tn and fp count pairs labelled "not duplicate"; fn and tp count pairs
# labelled "duplicate".
print(f"Threshold: {threshold}")
print(f"correctly predicted not duplicates: {tn}")
print(f"incorrectly predicted not duplicates: {fp}")
print(f"incorrectly predicted duplicates: {fn}")
print(f"correctly predicted duplicates: {tp}")

Predict - threshold = 0.7


  0%|          | 0/10000 [00:00<?, ?it/s]

Threshold: 0.7
correctly predicted not duplicates: 4733
incorrectly predicted not duplicates: 1529
incorrectly predicted duplicates: 1824
correctly predicted duplicates: 1914
