Final Score (Late Submission): [score screenshot] https://www.kaggle.com/c/hta-tagging/notebooks

In [ ]:
import pandas as pd
from tqdm import tqdm

import unicodedata, re, itertools, sys

def remove_control_chars(the_string):
    # strip C0/C1 control characters (Unicode category 'Cc')
    control_chars = ''.join(map(chr, itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0))))

    control_char_re = re.compile('[%s]' % re.escape(control_chars))
    return control_char_re.sub('', the_string)

def reforge_aiat_dataset(file_target, output_csv_name):
    df = pd.read_csv(file_target)
    df['text'] = ""
    for x in tqdm(range(len(df))):
        folder_name = df['Filename'][x][:5]
        file_name = df['Filename'][x]
        target_location = "train-data/" + folder_name + "/" + file_name
        with open(target_location, "r", encoding="utf8") as f:
            text_data = f.read()
        # drop non-ASCII characters, flatten newlines, then strip control characters
        df.at[x, 'text'] = remove_control_chars(text_data.encode('ascii', 'ignore').decode('ascii').replace('\n', ' '))
    df.to_csv(output_csv_name, index=False)

reforge_aiat_dataset("train.csv","reforge_train_set.csv")

df = pd.read_csv("reforge_train_set.csv")

def reforge_aiat_test_dataset(file_target, output_csv_name):
    df = pd.read_csv(file_target)
    df['text'] = ""
    for x in tqdm(range(len(df))):
        folder_name = df['Id'][x][:5]
        file_name = df['Id'][x]
        target_location = "test-data/test-data/" + folder_name + "/" + file_name
        with open(target_location, "r", encoding="utf8", errors='ignore') as f:
            text_data = f.read()
        df.at[x, 'text'] = remove_control_chars(text_data.encode('ascii', 'ignore').decode('ascii').replace('\n', ' '))
    df.to_csv(output_csv_name, index=False)

reforge_aiat_test_dataset('test.csv','reforge_test_set.csv')

df.head()
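
A quick sanity check of the cleaning helpers on a made-up string (hypothetical input, not from the dataset):

In [ ]:
sample = "Randomized\x07 trial \u00b1 double-blind\nprotocol"
print(remove_control_chars(sample.encode('ascii', 'ignore').decode('ascii').replace('\n', ' ')))
# -> 'Randomized trial  double-blind protocol': the bell character and the non-ASCII sign are dropped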
In [ ]:
 
In [85]:
import pandas as pd
In [2]:
data_df = pd.read_csv("reforge_train_set.csv")
data_df.head()
Out[2]:
Filename Blinding of intervention Blinding of Outcome assessment Classes text
0 00060-02.txt P P PP A Multicomponent Intervention To Prevent Major...
1 00060-03.txt N N NN Original article Self-management versus conven...
2 00060-04.txt N N NN Original Research Home Management of Oral Anti...
3 00060-05.txt P P PP For personal use only. Not to be reproduced wi...
4 00060-06.txt P P PP 2016 Pozzi et al. This work is published and ...
In [3]:
data_df.count()
Out[3]:
Filename                          628
Blinding of intervention          628
Blinding of Outcome assessment    628
Classes                           628
text                              628
dtype: int64
In [4]:
#Check for duplication
dv_duplicate = data_df['text'].count() - data_df['text'].nunique()
dv_duplicate
Out[4]:
44
In [5]:
data_df.drop_duplicates(['text'], inplace=True)
In [6]:
#Check for duplication
data_df['text'].count() - data_df['text'].nunique()
Out[6]:
0
In [7]:
#Check correctness: any rows with missing values?
data_df.isna().sum()
Out[7]:
Filename                          0
Blinding of intervention          0
Blinding of Outcome assessment    0
Classes                           0
text                              0
dtype: int64
In [8]:
data_df.count()
Out[8]:
Filename                          584
Blinding of intervention          584
Blinding of Outcome assessment    584
Classes                           584
text                              584
dtype: int64

Preprocess

In [9]:
def sentence_langdetect(dataframe, sentencecolumn):
    # make langdetect deterministic
    DetectorFactory.seed = 0

    # detected language per row
    languages = []

    # go through each text
    for ii in tqdm(range(len(dataframe))):
        # split by space into a list; detection uses at most the first 50 tokens
        text = dataframe.iloc[ii][sentencecolumn].split(" ")

        lang = "en"
        try:
            if len(text) > 50:
                lang = detect(" ".join(text[:50]))
            elif len(text) > 0:
                lang = detect(" ".join(text))
        # ugh... the beginning of the document was not in a good format
        except Exception:
            all_words = set(text)
            try:
                lang = detect(" ".join(all_words))
            # still nothing usable; give up on this document
            except Exception:
                lang = "unknown"

        # record the detected language
        languages.append(lang)

    languages_dict = {}
    for lang in set(languages):
        languages_dict[lang] = languages.count(lang)
    print("Report of Detected Language:")
    print(languages_dict)
    return languages
In [10]:
from tqdm import tqdm
import nltk
from langdetect import detect
from langdetect import DetectorFactory
In [11]:
data_languages = sentence_langdetect(data_df, 'text')
100%|████████████████████████████████████████████████████████████████████████████████| 584/584 [00:05<00:00, 99.39it/s]
Report of Detected Language:
{'de': 3, 'pt': 1, 'es': 2, 'en': 576, 'fr': 1, 'it': 1}

In [12]:
data_df['language'] = data_languages
In [13]:
data_df = data_df[data_df['language'] == 'en']
del data_df['language']
In [15]:
dv_foreign = 584 - len(data_df)  # 584 rows remained after deduplication
dv_foreign
Out[15]:
8
In [16]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
import string
In [17]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
custom_stop_words_academicpaper = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]

for w in custom_stop_words_academicpaper:
    if w not in stopwords:
        stopwords.append(w)
In [18]:
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

def clean_spacy_tokenizer(dirty):
    mytokens = parser(dirty)
    # lemmatize and lowercase; keep pronouns as their surface form
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    # drop stopwords and punctuation
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    return " ".join(mytokens)
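
A quick smoke test of the tokenizer (hypothetical sentence; the exact lemmas depend on the installed en_core_sci_lg version):

In [ ]:
print(clean_spacy_tokenizer("The patients were randomized to the intervention group."))
# roughly: 'patient randomize intervention group'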
In [19]:
tqdm.pandas()
data_df['text'] = data_df['text'].progress_apply(clean_spacy_tokenizer)
C:\ProgramData\Anaconda3\lib\site-packages\tqdm\std.py:668: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  from pandas import Panel
100%|████████████████████████████████████████████████████████████████████████████████| 576/576 [03:51<00:00,  2.49it/s]
In [20]:
data_df.head()
Out[20]:
Filename Blinding of intervention Blinding of Outcome assessment Classes text
0 00060-02.txt P P PP multicomponent intervention prevent major blee...
1 00060-03.txt N N NN original article self-management versus conven...
2 00060-04.txt N N NN original research home management oral anticoa...
3 00060-05.txt P P PP personal use reproduce lancet articles lancet ...
4 00060-06.txt P P PP 2016 pozzi work publish dove medical press lim...
In [21]:
data_df['word_count'] = data_df['text'].apply(lambda x: len(x.strip().split()))
data_df['unique_words'] = data_df['text'].apply(lambda x:len(set(str(x).split())))
In [22]:
import seaborn as sns
In [23]:
sns.distplot(data_df['word_count'])
data_df['word_count'].describe()
Out[23]:
count      576.000000
mean      2834.859375
std       3591.818308
min         32.000000
25%        988.000000
50%       2503.500000
75%       3781.000000
max      46017.000000
Name: word_count, dtype: float64
In [24]:
sns.distplot(data_df['unique_words'])
data_df['unique_words'].describe()
Out[24]:
count     576.000000
mean      944.343750
std       733.762321
min        25.000000
25%       493.750000
50%       958.000000
75%      1309.750000
max      6598.000000
Name: unique_words, dtype: float64
In [73]:
import numpy as np
import matplotlib.pyplot as plt

N = 2
dup_means = ((dv_duplicate * 100) / 628, 0)
foreign_means = ((dv_foreign * 100) / 628, 0)
clean_means = (0, ((628 - (dv_foreign + dv_duplicate)) * 100) / 628)
ind = np.arange(N)    # the x locations for the groups
width = 0.5           # the width of the bars

p1 = plt.bar(ind, dup_means, width)
p2 = plt.bar(ind, foreign_means, width, bottom=dup_means)
p3 = plt.bar(ind, clean_means, width, bottom=np.array(dup_means) + np.array(foreign_means))

plt.ylabel('Share of raw records (%)')
plt.title('Dropped vs. kept records (of 628 raw documents)')
plt.xticks(ind, ('Inconsistent', 'Consistent'))
plt.yticks(np.arange(0, 101, 10))
plt.legend((p1, p2, p3), ('Duplicates', 'Non-English', 'Clean'))

plt.show()
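
For reference, the percentages behind the chart (44 duplicates and 8 non-English documents out of 628 raw records):

In [ ]:
for name, n in [('duplicates', dv_duplicate), ('non-English', dv_foreign), ('clean', 628 - dv_duplicate - dv_foreign)]:
    print('{}: {:.1f}%'.format(name, 100 * n / 628))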

Feature Engineering

In [107]:
data_df.groupby('Blinding of intervention').size()
Out[107]:
Blinding of intervention
N    122
P    217
Q    237
dtype: int64
In [108]:
def blinding_label(row, targetcolumn):
    if (row[targetcolumn] == 'N'):
        return 0
    if (row[targetcolumn] == 'P'):
        return 1
    if (row[targetcolumn] == 'Q'):
        return 2
    return -1  # flag invalid input
In [109]:
def blinding_positive(row, targetcolumn):
    if (row[targetcolumn] == 'N'):
        return 0
    if (row[targetcolumn] == 'P'):
        return 1
    if (row[targetcolumn] == 'Q'):
        return 0
    return -1  # flag invalid input
In [110]:
def blinding_negative(row, targetcolumn):
    if (row[targetcolumn] == 'N'):
        return 1
    if (row[targetcolumn] == 'P'):
        return 0
    if (row[targetcolumn] == 'Q'):
        return 0
    return -1  # flag invalid input
In [111]:
def blinding_question(row, targetcolumn):
    if (row[targetcolumn] == 'N'):
        return 0
    if (row[targetcolumn] == 'P'):
        return 0
    if (row[targetcolumn] == 'Q'):
        return 1
    return -1  # flag invalid input
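
The four helpers above are plain label and one-vs-rest indicator encodings; the same idea in vectorized pandas form, as a sketch (the explicit row functions remain what the rest of the notebook uses):

In [ ]:
label_map = {'N': 0, 'P': 1, 'Q': 2}
boi_class = data_df['Blinding of intervention'].map(label_map).fillna(-1).astype(int)
boi_is_p = (data_df['Blinding of intervention'] == 'P').astype(int)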
In [74]:
len(data_df)
Out[74]:
576
In [112]:
data_df.groupby('Classes').size()
Out[112]:
Classes
NN     79
NP     12
NQ     31
PP    207
PQ     10
QN      1
QP     18
QQ    218
dtype: int64
In [113]:
def pair_of_blinding_label(row, targetcolumn):
    if (row[targetcolumn] == 'NN'):
        return 0
    if (row[targetcolumn] == 'NP'):
        return 1
    if (row[targetcolumn] == 'NQ'):
        return 2
    if (row[targetcolumn] == 'PP'):
        return 3
    if (row[targetcolumn] == 'PQ'):
        return 4
    if (row[targetcolumn] == 'QN'):
        return 5
    if (row[targetcolumn] == 'QP'):
        return 6
    if (row[targetcolumn] == 'QQ'):
        return 7
    if (row[targetcolumn] == 'PN'):
        return 8
    return -1  # flag invalid input
In [114]:
def extract_feature(extraction, dataframe, targetcolumn):
    # remember whether a previous 'extracted_feature' column is being overwritten
    overwriting = 'extracted_feature' in dataframe
    dataframe['extracted_feature'] = dataframe.apply(lambda row: extraction(row, targetcolumn), axis=1)
    if overwriting:
        result = "Extraction complete and overwrote the previous 'extracted_feature'"
    else:
        result = "Extraction complete"
    # a -1 anywhere means the extraction function did not recognise a label
    if -1 in dataframe['extracted_feature'].unique():
        result = "ERR: Some or all records could not be extracted with this extraction function, or the call is incorrect"
        del dataframe['extracted_feature']
        if overwriting:
            result = result + "\nNOTE: Your previous 'extracted_feature' was overwritten and has now been removed"
    print(result)
In [117]:
extract_feature(blinding_label, data_df, "Blinding of intervention")
data_df['BoI_Class'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_label, data_df, "Blinding of Outcome assessment")
data_df['BoA_Class'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(pair_of_blinding_label, data_df, "Classes")
data_df['Pair_Class'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_positive, data_df, "Blinding of intervention")
data_df['BoI_P'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_positive, data_df, "Blinding of Outcome assessment")
data_df['BoA_P'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_negative, data_df, "Blinding of intervention")
data_df['BoI_N'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_negative, data_df, "Blinding of Outcome assessment")
data_df['BoA_N'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_question, data_df, "Blinding of intervention")
data_df['BoI_Q'] = data_df['extracted_feature']
del data_df['extracted_feature']

extract_feature(blinding_question, data_df, "Blinding of Outcome assessment")
data_df['BoA_Q'] = data_df['extracted_feature']
del data_df['extracted_feature']
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
In [118]:
data_df.sample(n=8, random_state=4)
Out[118]:
Filename Blinding of intervention Blinding of Outcome assessment Classes text word_count unique_words BoI_Class BoA_Class Pair_Class BoI_P BoA_P BoI_N BoA_N BoI_Q BoA_Q
67 00069-01.txt Q Q QQ 2013 http://informahealthcare.com/jmf issn 147... 2149 897 2 2 7 0 0 0 0 1 1
333 00126-05.txt N N NN comparison effect honey dextromethorphan diphe... 3098 1349 0 0 0 0 0 1 1 0 0
420 00153-09.txt Q Q QQ quality life patients endometrial cancer under... 3658 1221 2 2 7 0 0 0 0 1 1
307 00121-03.txt N N NN use factor ix complex warfarin-related intracr... 7281 1405 0 0 0 0 0 1 1 0 0
519 00171-12.txt Q Q QQ abstract 46 chronic schizophrenic outpatient m... 132 98 2 2 7 0 0 0 0 1 1
216 00097-18.txt Q Q QQ minilaparotomy laparoscopy sterilization multi... 3056 1171 2 2 7 0 0 0 0 1 1
296 00108-01.txt N Q NQ research article metformin treatment type 2 di... 4877 1418 0 2 2 0 0 1 0 0 1
565 00185-39.txt Q Q QQ abstract reactogenicity immunize activity vacc... 75 57 2 2 7 0 0 0 0 1 1
In [ ]:
data_df.to_csv('data/FCNP/Train_Heavy.csv', index = False)
In [119]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
In [120]:
#Enter splitting percentages here (they must sum to 100).
TrainSize = 70
ValSize = 15
TestSize = 15
In [121]:
def splitter(dataframe, labelcolumn, TrainSize, ValSize, TestSize):
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    X_test = []
    y_test = []
    if TrainSize + ValSize + TestSize != 100:
        print("ERR: Dataframe split percentages are incorrect; make sure they sum to 100")
    else:
        X = dataframe
        y = LabelEncoder().fit_transform(dataframe[labelcolumn])
        # first carve off the test split, then split the remainder into train/val
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(TestSize / 100), random_state=1)
        subsplit = ValSize / (100 - TestSize)  # e.g. 15/85 for a 70/15/15 split
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=subsplit, random_state=1)
        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        X_val = X_val.reset_index(drop=True)
    return X_train, y_train, X_val, y_val, X_test, y_test
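
Worked arithmetic for the nested split: the first train_test_split holds out 15% for test, and the remaining 85% is split again with test_size = 15/85 ≈ 0.176, so validation also ends up at 15% of the original data:

In [ ]:
print(0.85 * (15 / 85))  # ≈ 0.15 of the original data goes to validation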
In [122]:
# y is not necessary at this step, but make sure to declare it correctly
X_train, y_train, X_val, y_val, X_test, y_test = splitter(data_df, 'Blinding of intervention', TrainSize, ValSize, TestSize)
In [123]:
X_train.to_csv('data/FCNP/Train.csv', index = False)
X_val.to_csv('data/FCNP/Val.csv', index = False)
X_test.to_csv('data/FCNP/Test.csv', index = False)
In [124]:
del data_df, X_train, y_train, X_val, y_val, X_test, y_test
#Optional: free up memory.
In [ ]:
 
In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pickle
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import tensorflow as tf

import pandas, xgboost, numpy, textblob, string, nltk
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Data Loading

In [2]:
df_train = pd.read_csv("data/FCNP/Train.csv")
df_val = pd.read_csv("data/FCNP/Val.csv")
#Keep the test set for later! Treat it like a real hackathon scenario.
In [3]:
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
In [4]:
start_time = datetime.now()

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(df_train['text'])

# transform the training and validation data using the count vectorizer
# (computed for completeness; only the tf-idf features are used below)
xtrain_count = count_vect.transform(df_train['text'])
xvalid_count = count_vect.transform(df_val['text'])

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=4096)
tfidf_vect.fit(df_train['text'])
xtrain_tfidf =  tfidf_vect.transform(df_train['text'])
xvalid_tfidf =  tfidf_vect.transform(df_val['text'])

time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
Time elapsed (hh:mm:ss.ms) 0:00:04.709892
In [5]:
from sklearn import model_selection, preprocessing, linear_model, metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
In [6]:
def train_model(classification, feature_vector_train, label):
    # fit the training dataset on the classifier
    classification.fit(feature_vector_train, label)
    return classification
In [7]:
def predict_model(classifier, feature_vector_valid, label):
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, label)
In [8]:
def do_classify(dataframe, Name_of_Pred, x_train, x_val, y_train, y_val):
    LR_WordTFIDF_classifier = train_model(linear_model.LogisticRegression(), x_train, y_train)
    LR_WordTFIDF_accuracy = predict_model(LR_WordTFIDF_classifier, x_val, y_val)
    LR_WordTFIDF_predictions = LR_WordTFIDF_classifier.predict(x_val)

    # temp_df aliases the input, so the prediction column is added to `dataframe` in place
    temp_df = dataframe
    temp_df[Name_of_Pred] = LR_WordTFIDF_predictions
    return temp_df, LR_WordTFIDF_accuracy
In [9]:
def do_benchmark(y_pred, y_val, average_type='binary', f1_average='macro'):
    # note the asymmetric defaults: precision/recall use 'binary' averaging (class 1 only)
    # while F1 uses 'macro', so the three numbers are computed on different bases
    model_accuracy = accuracy_score(y_val, y_pred)
    model_precision = precision_score(y_val, y_pred, average=average_type)
    model_recall = recall_score(y_val, y_pred, average=average_type)
    model_f1 = f1_score(y_val, y_pred, average=f1_average)
    return model_accuracy, model_precision, model_recall, model_f1
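
A minimal check of the mixed averaging defaults (hypothetical labels): precision and recall are reported for class 1 only, while the F1 is macro-averaged over both classes:

In [ ]:
y_true = [0, 0, 1, 1]
y_pred = [0, 1, 1, 1]
print(do_benchmark(y_pred, y_true))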
In [10]:
def report_benchmark(model_name, model_acc, model_pre, model_rec, model_f1):
    print(model_name)
    print("Accuracy:  ",model_acc)
    print("Precision: ",model_pre)
    print("Recall:    ",model_rec)
    print("F1:        ",model_f1)
In [11]:
BoI_P_df_val, BoI_P_Acc = do_classify(df_val, 'Pred_BoI_P', xtrain_tfidf, xvalid_tfidf, df_train['BoI_P'], df_val['BoI_P'])
BoI_P_Acc, BoI_P_Pre, BoI_P_Rec, BoI_P_F1 = do_benchmark(BoI_P_df_val['Pred_BoI_P'], df_val['BoI_P'])

BoI_Q_df_val, BoI_Q_Acc = do_classify(df_val, 'Pred_BoI_Q', xtrain_tfidf, xvalid_tfidf, df_train['BoI_Q'], df_val['BoI_Q'])
BoI_Q_Acc, BoI_Q_Pre, BoI_Q_Rec, BoI_Q_F1 = do_benchmark(BoI_Q_df_val['Pred_BoI_Q'], df_val['BoI_Q'])

BoI_N_df_val, BoI_N_Acc = do_classify(df_val, 'Pred_BoI_N', xtrain_tfidf, xvalid_tfidf, df_train['BoI_N'], df_val['BoI_N'])
BoI_N_Acc, BoI_N_Pre, BoI_N_Rec, BoI_N_F1 = do_benchmark(BoI_N_df_val['Pred_BoI_N'], df_val['BoI_N'])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [12]:
df_val['BoI_Class'].value_counts()
Out[12]:
2    44
1    31
0    12
Name: BoI_Class, dtype: int64
In [102]:
report_benchmark("BoI_P", BoI_P_Acc, BoI_P_Pre, BoI_P_Rec, BoI_P_F1)
BoI_P
Accuracy:   0.7931034482758621
Precision:  0.782608695652174
Recall:     0.5806451612903226
F1:         0.7583333333333333
In [103]:
report_benchmark("BoI_Q", BoI_Q_Acc, BoI_Q_Pre, BoI_Q_Rec, BoI_Q_F1)
BoI_Q
Accuracy:   0.6551724137931034
Precision:  0.8181818181818182
Recall:     0.4090909090909091
F1:         0.6338383838383839
In [104]:
report_benchmark("BoI_N", BoI_N_Acc, BoI_N_Pre, BoI_N_Rec, BoI_N_F1)
BoI_N
Accuracy:   0.8620689655172413
Precision:  0.0
Recall:     0.0
F1:         0.46296296296296297
In [ ]:
 
In [20]:
BoA_P_df_val, BoA_P_Acc = do_classify(df_val, 'Pred_BoA_P', xtrain_tfidf, xvalid_tfidf, df_train['BoA_P'], df_val['BoA_P'])
BoA_P_Acc, BoA_P_Pre, BoA_P_Rec, BoA_P_F1 = do_benchmark(BoA_P_df_val['Pred_BoA_P'], df_val['BoA_P'])

BoA_Q_df_val, BoA_Q_Acc = do_classify(df_val, 'Pred_BoA_Q', xtrain_tfidf, xvalid_tfidf, df_train['BoA_Q'], df_val['BoA_Q'])
BoA_Q_Acc, BoA_Q_Pre, BoA_Q_Rec, BoA_Q_F1 = do_benchmark(BoA_Q_df_val['Pred_BoA_Q'], df_val['BoA_Q'])

BoA_N_df_val, BoA_N_Acc = do_classify(df_val, 'Pred_BoA_N', xtrain_tfidf, xvalid_tfidf, df_train['BoA_N'], df_val['BoA_N'])
BoA_N_Acc, BoA_N_Pre, BoA_N_Rec, BoA_N_F1 = do_benchmark(BoA_N_df_val['Pred_BoA_N'], df_val['BoA_N'])
In [21]:
df_val['BoA_Class'].value_counts()
Out[21]:
2    40
1    38
0     9
Name: BoA_Class, dtype: int64
In [22]:
report_benchmark("BoA_P", BoA_P_Acc, BoA_P_Pre, BoA_P_Rec, BoA_P_F1)
BoA_P
Accuracy:   0.7241379310344828
Precision:  0.7916666666666666
Recall:     0.5
F1:         0.6993087557603687
In [23]:
report_benchmark("BoA_Q", BoA_Q_Acc, BoA_Q_Pre, BoA_Q_Rec, BoA_Q_F1)
BoA_Q
Accuracy:   0.6551724137931034
Precision:  0.6785714285714286
Recall:     0.475
F1:         0.6379023307436182
In [24]:
report_benchmark("BoA_N", BoA_N_Acc, BoA_N_Pre, BoA_N_Rec, BoA_N_F1)
BoA_N
Accuracy:   0.896551724137931
Precision:  0.0
Recall:     0.0
F1:         0.4727272727272727
In [25]:
BoI_N_df_val['Pred_BoA_N'].value_counts()
Out[25]:
0    87
Name: Pred_BoA_N, dtype: int64
In [116]:
def blinding_predict(row, x_p_column, x_q_column, x_n_column):
    if ((row[x_p_column] == 0) & (row[x_q_column] == 0) & (row[x_n_column] == 1)):
        return 0
    if ((row[x_p_column] == 1) & (row[x_q_column] == 0) & (row[x_n_column] == 0)):
        return 1
    if ((row[x_p_column] == 0) & (row[x_q_column] == 1) & (row[x_n_column] == 0)):
        return 2
    return -1  # flag an ambiguous or empty one-hot triple
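
blinding_predict only resolves clean one-hot triples; any conflicting or empty combination decodes to -1, which the pairing step later patches up. For example (hypothetical column names):

In [ ]:
print(blinding_predict({'p': 1, 'q': 1, 'n': 0}, 'p', 'q', 'n'))  # -1: two heads fired at once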
In [117]:
def do_validation(validation, dataframe, x_p_column, x_q_column, x_n_column):
    # decode the three one-hot prediction columns into a single 'validate_result' class column;
    # any existing 'validate_result' is overwritten in place
    dataframe['validate_result'] = dataframe.apply(lambda row: validation(row, x_p_column, x_q_column, x_n_column), axis=1)
In [141]:
def order_evaluate(dataframe, BlindType, N_df, P_df, Q_df):
    temp_df = dataframe
    # df_xX naming: lowercase p/n = predicted positive/negative, uppercase X = which head (N, P or Q).
    # Each ordering (e.g. NQP) trusts the heads in that priority order; the remainder is forced to the last class.
    df_pN = temp_df[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_N'] == 1])].dropna()
    df_nN = temp_df[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_N'] == 0])].dropna()
    
    df_nN_pQ = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_Q'] == 1])].dropna()
    df_nN_nQ = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_Q'] == 0])].dropna()
    df_nN_nQ_pP = df_nN_nQ[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_Q'] != 1])].dropna()
    df_nN_nQ_pP['Pred_'+BlindType+'_P'] = 1.0
    df_NQP = pd.concat([df_pN, df_nN_pQ, df_nN_nQ_pP], ignore_index=False).sort_index()
    
    df_nN_pP = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_P'] == 1])].dropna()
    df_nN_nP = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_P'] == 0])].dropna()
    df_nN_nP_pQ = df_nN_nP[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_P'] != 1])].dropna()
    df_nN_nP_pQ['Pred_'+BlindType+'_Q'] = 1.0
    df_NPQ = pd.concat([df_pN, df_nN_pP, df_nN_nP_pQ], ignore_index=False).sort_index()
    
    
    df_pP = temp_df[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_P'] == 1])].dropna()
    df_nP = temp_df[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_P'] == 0])].dropna()
    
    df_nP_pQ = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_Q'] == 1])].dropna()
    df_nP_nQ = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_Q'] == 0])].dropna()
    df_nP_nQ_pN = df_nP_nQ[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_Q'] != 1])].dropna()
    df_nP_nQ_pN['Pred_'+BlindType+'_N'] = 1.0
    df_PQN = pd.concat([df_pP, df_nP_pQ, df_nP_nQ_pN], ignore_index=False).sort_index()
    
    df_nP_pN = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_N'] == 1])].dropna()
    df_nP_nN = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_N'] == 0])].dropna()
    df_nP_nN_pQ = df_nP_nN[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_N'] != 1])].dropna()
    df_nP_nN_pQ['Pred_'+BlindType+'_Q'] = 1.0
    df_PNQ = pd.concat([df_pP, df_nP_pN, df_nP_nN_pQ], ignore_index=False).sort_index()
    
    
    df_pQ = temp_df[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_Q'] == 1])].dropna()
    df_nQ = temp_df[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_Q'] == 0])].dropna()
    
    df_nQ_pP = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_P'] == 1])].dropna()
    df_nQ_nP = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_P'] == 0])].dropna()
    df_nQ_nP_pN = df_nQ_nP[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_P'] != 1])].dropna()
    df_nQ_nP_pN['Pred_'+BlindType+'_N'] = 1.0
    df_QPN = pd.concat([df_pQ, df_nQ_pP, df_nQ_nP_pN], ignore_index=False).sort_index()
    
    df_nQ_pN = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_N'] == 1])].dropna()
    df_nQ_nN = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_N'] == 0])].dropna()
    df_nQ_nN_pP = df_nQ_nN[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_N'] != 1])].dropna()
    df_nQ_nN_pP['Pred_'+BlindType+'_P'] = 1.0
    df_QNP = pd.concat([df_pQ, df_nQ_pN, df_nQ_nN_pP], ignore_index=False).sort_index()
    
    
    
    for name, df_x in [('NQP', df_NQP), ('NPQ', df_NPQ), ('PQN', df_PQN),
                       ('PNQ', df_PNQ), ('QPN', df_QPN), ('QNP', df_QNP)]:
        do_validation(blinding_predict, df_x, 'Pred_'+BlindType+'_P', 'Pred_'+BlindType+'_Q', 'Pred_'+BlindType+'_N')
        df_x[BlindType+'_Validate'] = df_x['validate_result']
        del df_x['validate_result']
        print(BlindType+'_'+name+': Done')
    
    return df_NQP, df_NPQ, df_PQN, df_PNQ, df_QPN, df_QNP
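
Each ordering boils down to a fixed-priority cascade over the three one-vs-rest heads. A compact sketch of the rule (hypothetical helper, not used below; unlike this sketch, the notebook keeps the raw prediction triples, so a row where an earlier head fires together with another head still decodes to -1):

In [ ]:
def cascade(row, order, blind_type='BoI'):
    # trust the first head in `order` that fires; force the last class otherwise
    code_to_class = {'N': 0, 'P': 1, 'Q': 2}
    for code in order[:2]:
        if row['Pred_' + blind_type + '_' + code] == 1:
            return code_to_class[code]
    return code_to_class[order[-1]]

print(cascade({'Pred_BoI_N': 0, 'Pred_BoI_Q': 0, 'Pred_BoI_P': 0}, 'NQP'))  # falls through to P -> 1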
In [142]:
BoI_NQP_df, BoI_NPQ_df, BoI_PQN_df, BoI_PNQ_df, BoI_QPN_df, BoI_QNP_df = \
order_evaluate(df_val,"BoI", BoI_N_df_val, BoI_P_df_val, BoI_Q_df_val)
BoI_NQP: Done
BoI_NPQ: Done
BoI_PQN: Done
BoI_PNQ: Done
BoI_QPN: Done
BoI_QNP: Done
In [143]:
df_val['BoI_Class'].value_counts()
Out[143]:
2    44
1    31
0    12
Name: BoI_Class, dtype: int64
In [159]:
BoI_NQP_Acc, BoI_NQP_Pre, BoI_NQP_Rec, BoI_NQP_F1 = do_benchmark(BoI_NQP_df['BoI_Validate'], df_val['BoI_Class'], average_type='macro')
BoI_NPQ_Acc, BoI_NPQ_Pre, BoI_NPQ_Rec, BoI_NPQ_F1 = do_benchmark(BoI_NPQ_df['BoI_Validate'], df_val['BoI_Class'], average_type='macro')
BoI_PQN_Acc, BoI_PQN_Pre, BoI_PQN_Rec, BoI_PQN_F1 = do_benchmark(BoI_PQN_df['BoI_Validate'], df_val['BoI_Class'], average_type='macro')
BoI_PNQ_Acc, BoI_PNQ_Pre, BoI_PNQ_Rec, BoI_PNQ_F1 = do_benchmark(BoI_PNQ_df['BoI_Validate'], df_val['BoI_Class'], average_type='macro')
BoI_QPN_Acc, BoI_QPN_Pre, BoI_QPN_Rec, BoI_QPN_F1 = do_benchmark(BoI_QPN_df['BoI_Validate'], df_val['BoI_Class'], average_type='macro')
BoI_QNP_Acc, BoI_QNP_Pre, BoI_QNP_Rec, BoI_QNP_F1 = do_benchmark(BoI_QNP_df['BoI_Validate'], df_val['BoI_Class'], average_type='macro')
In [160]:
report_benchmark("BoI_NQP", BoI_NQP_Acc, BoI_NQP_Pre, BoI_NQP_Rec, BoI_NQP_F1), print("\n")
report_benchmark("BoI_NPQ", BoI_NPQ_Acc, BoI_NPQ_Pre, BoI_NPQ_Rec, BoI_NPQ_F1), print("\n")
report_benchmark("BoI_PQN", BoI_PQN_Acc, BoI_PQN_Pre, BoI_PQN_Rec, BoI_PQN_F1), print("\n")
report_benchmark("BoI_PNQ", BoI_PNQ_Acc, BoI_PNQ_Pre, BoI_PNQ_Rec, BoI_PNQ_F1), print("\n")
report_benchmark("BoI_QPN", BoI_QPN_Acc, BoI_QPN_Pre, BoI_QPN_Rec, BoI_QPN_F1), print("\n")
report_benchmark("BoI_QNP", BoI_QNP_Acc, BoI_QNP_Pre, BoI_QNP_Rec, BoI_QNP_F1)
BoI_NQP
Accuracy:   0.5402298850574713
Precision:  0.4214452214452215
Recall:     0.44819159335288367
F1:         0.3832070707070707


BoI_NPQ
Accuracy:   0.6781609195402298
Precision:  0.4744112318840579
Recall:     0.5041544477028348
F1:         0.4753086419753087


BoI_PQN
Accuracy:   0.5057471264367817
Precision:  0.5970889014367275
Recall:     0.5521342456826328
F1:         0.5028058361391695


BoI_PNQ
Accuracy:   0.6781609195402298
Precision:  0.4744112318840579
Recall:     0.5041544477028348
F1:         0.4753086419753087


BoI_QPN
Accuracy:   0.5057471264367817
Precision:  0.5970889014367275
Recall:     0.5521342456826328
F1:         0.5028058361391695


BoI_QNP
Accuracy:   0.5402298850574713
Precision:  0.4214452214452215
Recall:     0.44819159335288367
F1:         0.3832070707070707
In [154]:
BoA_NQP_df, BoA_NPQ_df, BoA_PQN_df, BoA_PNQ_df, BoA_QPN_df, BoA_QNP_df = \
order_evaluate(df_val,"BoA", BoA_N_df_val, BoA_P_df_val, BoA_Q_df_val)
BoA_NQP: Done
BoA_NPQ: Done
BoA_PQN: Done
BoA_PNQ: Done
BoA_QPN: Done
BoA_QNP: Done
In [ ]:
 
In [155]:
BoA_NQP_Acc, BoA_NQP_Pre, BoA_NQP_Rec, BoA_NQP_F1 = do_benchmark(BoA_NQP_df['BoA_Validate'], df_val['BoA_Class'], average_type='macro')
BoA_NPQ_Acc, BoA_NPQ_Pre, BoA_NPQ_Rec, BoA_NPQ_F1 = do_benchmark(BoA_NPQ_df['BoA_Validate'], df_val['BoA_Class'], average_type='macro')
BoA_PQN_Acc, BoA_PQN_Pre, BoA_PQN_Rec, BoA_PQN_F1 = do_benchmark(BoA_PQN_df['BoA_Validate'], df_val['BoA_Class'], average_type='macro')
BoA_PNQ_Acc, BoA_PNQ_Pre, BoA_PNQ_Rec, BoA_PNQ_F1 = do_benchmark(BoA_PNQ_df['BoA_Validate'], df_val['BoA_Class'], average_type='macro')
BoA_QPN_Acc, BoA_QPN_Pre, BoA_QPN_Rec, BoA_QPN_F1 = do_benchmark(BoA_QPN_df['BoA_Validate'], df_val['BoA_Class'], average_type='macro')
BoA_QNP_Acc, BoA_QNP_Pre, BoA_QNP_Rec, BoA_QNP_F1 = do_benchmark(BoA_QNP_df['BoA_Validate'], df_val['BoA_Class'], average_type='macro')
In [156]:
report_benchmark("BoA_NQP", BoA_NQP_Acc, BoA_NQP_Pre, BoA_NQP_Rec, BoA_NQP_F1), print("\n")
report_benchmark("BoA_NPQ", BoA_NPQ_Acc, BoA_NPQ_Pre, BoA_NPQ_Rec, BoA_NPQ_F1), print("\n")
report_benchmark("BoA_PQN", BoA_PQN_Acc, BoA_PQN_Pre, BoA_PQN_Rec, BoA_PQN_F1), print("\n")
report_benchmark("BoA_PNQ", BoA_PNQ_Acc, BoA_PNQ_Pre, BoA_PNQ_Rec, BoA_PNQ_F1), print("\n")
report_benchmark("BoA_QPN", BoA_QPN_Acc, BoA_QPN_Pre, BoA_QPN_Rec, BoA_QPN_F1), print("\n")
report_benchmark("BoA_QNP", BoA_QNP_Acc, BoA_QNP_Pre, BoA_QNP_Rec, BoA_QNP_F1)
BoA_NQP
Accuracy:   0.5632183908045977
Precision:  0.395682001614205
Recall:     0.4214912280701754
F1:         0.39246007681423084


BoA_NPQ
Accuracy:   0.6436781609195402
Precision:  0.45965608465608465
Recall:     0.47500000000000003
F1:         0.4437832759160664


BoA_PQN
Accuracy:   0.5057471264367817
Precision:  0.5472222222222222
Recall:     0.5472222222222222
F1:         0.4814846759818298


BoA_PNQ
Accuracy:   0.6436781609195402
Precision:  0.45965608465608465
Recall:     0.47500000000000003
F1:         0.4437832759160664


BoA_QPN
Accuracy:   0.5057471264367817
Precision:  0.5472222222222222
Recall:     0.5472222222222222
F1:         0.4814846759818298


BoA_QNP
Accuracy:   0.5632183908045977
Precision:  0.395682001614205
Recall:     0.4214912280701754
F1:         0.39246007681423084
In [743]:
def pair_blinding_unlabel(row, BoI_df, BoA_df):
    if ((row[BoI_df] == 0) & (row[BoA_df] == 0)):
        return 'NN'
    elif ((row[BoI_df] == 0) & (row[BoA_df] == 1)):
        return 'NP'
    elif ((row[BoI_df] == 0) & (row[BoA_df] == 2)):
        return 'NQ'
    elif ((row[BoI_df] == 1) & (row[BoA_df] == 1)):
        return 'PP'
    elif ((row[BoI_df] == 1) & (row[BoA_df] == 2)):
        return 'PQ'
    elif ((row[BoI_df] == 2) & (row[BoA_df] == 0)):
        return 'QN'
    elif ((row[BoI_df] == 2) & (row[BoA_df] == 1)):
        return 'QP'
    elif ((row[BoI_df] == 2) & (row[BoA_df] == 2)):
        return 'QQ'
    elif ((row[BoI_df] == 1) & (row[BoA_df] == 0)):
        return 'PN'
    # Fallback intuition: if exactly one side is unresolved (-1), copy the resolved class to both positions
    elif (((row[BoI_df] == 0) & (row[BoA_df] == -1)) | ((row[BoI_df] == -1) & (row[BoA_df] == 0))):
        return 'NN'
    elif (((row[BoI_df] == 1) & (row[BoA_df] == -1)) | ((row[BoI_df] == -1) & (row[BoA_df] == 1))):
        return 'PP'
    elif (((row[BoI_df] == 2) & (row[BoA_df] == -1)) | ((row[BoI_df] == -1) & (row[BoA_df] == 2))):
        return 'QQ'
    # both sides unresolved: default to the most common pair in the training data
    elif ((row[BoI_df] == -1) & (row[BoA_df] == -1)):
        return 'QQ'
    return -1  # flag invalid input
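
A quick check of the fallback branches with hypothetical column names:

In [ ]:
print(pair_blinding_unlabel({'i': 2, 'a': -1}, 'i', 'a'))   # 'QQ'
print(pair_blinding_unlabel({'i': -1, 'a': 0}, 'i', 'a'))   # 'NN'
print(pair_blinding_unlabel({'i': -1, 'a': -1}, 'i', 'a'))  # 'QQ' (majority-pair default)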
In [744]:
def do_pairing(pairing, dataframe, BoI_df, BoA_df):
    dataframe['pairing_result'] = dataframe.apply (lambda row: pairing(row, BoI_df, BoA_df), axis=1)
    result = "Pairing complete"
    return print(result)
In [745]:
Val_result_df = pd.DataFrame()
Val_result_df['Filename'] = BoI_PNQ_df['Filename']
Val_result_df['BoI_Validate'] = BoI_PNQ_df['BoI_Validate']
Val_result_df['BoA_Validate'] = BoA_PNQ_df['BoA_Validate']
do_pairing(pair_blinding_unlabel, Val_result_df, 'BoI_Validate', 'BoA_Validate')
Pairing complete
In [746]:
#Val_result_df[Val_result_df['BoA_Validate']==0]
In [747]:
Val_result_df['Id'] = Val_result_df['Filename']
Val_result_df['Prediction'] = Val_result_df['pairing_result']
del Val_result_df['BoI_Validate'], Val_result_df['BoA_Validate'], Val_result_df['pairing_result'], Val_result_df['Filename']

Val_result_df.head()
Out[747]:
Id Prediction
0 00170-10.txt QQ
1 00129-07.txt QQ
2 00069-05.txt QQ
3 00060-19.txt QQ
4 00121-02.txt QQ
In [748]:
Val_result_df.to_csv('data/FCNP/Val_Submit.csv', index = False)
In [749]:
def score_pairing(row, pred_column, y_column):
    # 1 if the predicted pair exactly matches the true pair, else 0
    valid_pairs = {'NN', 'NP', 'NQ', 'PP', 'PQ', 'QN', 'QP', 'QQ', 'PN'}
    if (row[pred_column] == row[y_column]) and (row[pred_column] in valid_pairs):
        return 1
    return 0
In [750]:
Val_result_df['y'] = df_val['Classes']
do_pairing(score_pairing, Val_result_df, 'Prediction', 'y')
Pairing complete
In [751]:
Val_result_df['Score'] = Val_result_df['pairing_result']
del Val_result_df['y'], Val_result_df['pairing_result']

Val_result_df.head()
Out[751]:
Id Prediction Score
0 00170-10.txt QQ 1
1 00129-07.txt QQ 0
2 00069-05.txt QQ 1
3 00060-19.txt QQ 0
4 00121-02.txt QQ 0
In [752]:
def accuracy_pair_result(dataframe, y_column):
    sum_score = dataframe[dataframe[y_column] == 1][y_column].count()
    accuracy = (sum_score / dataframe[y_column].count()) * 100
    return accuracy
In [753]:
accuracy_pair_result(Val_result_df, 'Score')
Out[753]:
66.66666666666666

Now Operate on the Test Set

In [ ]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pickle
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import tensorflow as tf

import pandas, xgboost, numpy, textblob, string, nltk
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
In [754]:
test_data_df = pd.read_csv("reforge_test_set.csv")
test_data_df.head()
Out[754]:
Id Prediction text
0 00002-02.txt NaN Abstract Objective: To assess the effect of pr...
1 00002-04.txt NaN Abstract BACKGROUND: Steroid pre-treatments ma...
2 00002-06.txt NaN effect accross all trials. The pooled results ...
3 00002-07.txt NaN Human Reproduction vol.11 no.5 pp. 1035-1037, ...
4 00002-08.txt NaN 576 Progestogen therapy during pituitary desen...
In [755]:
del test_data_df['Prediction']
In [756]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
custom_stop_words_academicpaper = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]

for w in custom_stop_words_academicpaper:
    if w not in stopwords:
        stopwords.append(w)
In [757]:
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

def clean_spacy_tokenizer(dirty):
    mytokens = parser(dirty)
    # lemmatize and lowercase; keep pronouns as their surface form
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    # drop stopwords and punctuation
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    return " ".join(mytokens)
In [758]:
tqdm.pandas()
test_data_df['text'] = test_data_df['text'].progress_apply(clean_spacy_tokenizer)
C:\ProgramData\Anaconda3\lib\site-packages\tqdm\std.py:668: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  from pandas import Panel
100%|████████████████████████████████████████████████████████████████████████████████| 395/395 [02:53<00:00,  2.27it/s]
In [759]:
test_data_df.to_csv('test.csv', index=False)  # note: overwrites the raw test.csv with the preprocessed text

Loading Preprocessed Data

In [760]:
df_test = pd.read_csv("test.csv")
In [761]:
start_time = datetime.now()

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(df_train['text'])

# transform the test data using the count vectorizer (again unused below)
xtest_count = count_vect.transform(df_test['text'])

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=4096)
tfidf_vect.fit(df_train['text'])
xtrain_tfidf =  tfidf_vect.transform(df_train['text'])
xtest_tfidf =  tfidf_vect.transform(df_test['text'])

time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
Time elapsed (hh:mm:ss.ms) 0:00:05.703725
In [762]:
def do_classify_testset(dataframe, Name_of_Pred, x_train, x_test, y_train):
    LR_WordTFIDF_classifier = train_model(linear_model.LogisticRegression(), x_train, y_train)
    LR_WordTFIDF_predictions = LR_WordTFIDF_classifier.predict(x_test)
    # temp_df aliases the input, so each call adds its Pred_* column to the same dataframe
    temp_df = dataframe
    temp_df[Name_of_Pred] = LR_WordTFIDF_predictions
    return temp_df
In [763]:
BoI_P_df_test = do_classify_testset(df_test, 'Pred_BoI_P', xtrain_tfidf, xtest_tfidf, df_train['BoI_P'])
BoI_Q_df_test = do_classify_testset(df_test, 'Pred_BoI_Q', xtrain_tfidf, xtest_tfidf, df_train['BoI_Q'])
BoI_N_df_test = do_classify_testset(df_test, 'Pred_BoI_N', xtrain_tfidf, xtest_tfidf, df_train['BoI_N'])
BoA_P_df_test = do_classify_testset(df_test, 'Pred_BoA_P', xtrain_tfidf, xtest_tfidf, df_train['BoA_P'])
BoA_Q_df_test = do_classify_testset(df_test, 'Pred_BoA_Q', xtrain_tfidf, xtest_tfidf, df_train['BoA_Q'])
BoA_N_df_test = do_classify_testset(df_test, 'Pred_BoA_N', xtrain_tfidf, xtest_tfidf, df_train['BoA_N'])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
In [765]:
def testset_blinding_predict(row, p_column, q_column, n_column):
    if ((row[p_column] == 0) & (row[q_column] == 0) & (row[n_column] == 1)):
        return 0
    if ((row[p_column] == 1) & (row[q_column] == 0) & (row[n_column] == 0)):
        return 1
    if ((row[p_column] == 0) & (row[q_column] == 1) & (row[n_column] == 0)):
        return 2
    return -1  # flag an ambiguous or empty one-hot triple
In [771]:
def testset_do_validation(validation, dataframe, p_column, q_column, n_column):
    # same as do_validation: decode the one-hot prediction columns into 'validate_result',
    # overwriting any existing column in place
    dataframe['validate_result'] = dataframe.apply(lambda row: validation(row, p_column, q_column, n_column), axis=1)
In [772]:
def evaluate_testset(dataframe, BlindType, N_df, P_df, Q_df):
    temp_df = dataframe
    # same cascades as order_evaluate, but for the unlabelled test set
    # df_xX naming: lowercase p/n = predicted positive/negative, uppercase X = which head (N, P or Q)
    df_pN = temp_df[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_N'] == 1])].dropna()
    df_nN = temp_df[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_N'] == 0])].dropna()
    
    df_nN_pQ = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_Q'] == 1])].dropna()
    df_nN_nQ = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_Q'] == 0])].dropna()
    df_nN_nQ_pP = df_nN_nQ[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_Q'] != 1])].dropna()
    df_nN_nQ_pP['Pred_'+BlindType+'_P'] = 1.0
    df_NQP = pd.concat([df_pN, df_nN_pQ, df_nN_nQ_pP], ignore_index=False).sort_index()
    
    df_nN_pP = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_P'] == 1])].dropna()
    df_nN_nP = df_nN[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_P'] == 0])].dropna()
    df_nN_nP_pQ = df_nN_nP[temp_df.isin(N_df[N_df['Pred_'+BlindType+'_P'] != 1])].dropna()
    df_nN_nP_pQ['Pred_'+BlindType+'_Q'] = 1.0
    df_NPQ = pd.concat([df_pN, df_nN_pP, df_nN_nP_pQ], ignore_index=False).sort_index()
    
    
    df_pP = temp_df[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_P'] == 1])].dropna()
    df_nP = temp_df[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_P'] == 0])].dropna()
    
    df_nP_pQ = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_Q'] == 1])].dropna()
    df_nP_nQ = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_Q'] == 0])].dropna()
    df_nP_nQ_pN = df_nP_nQ[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_Q'] != 1])].dropna()
    df_nP_nQ_pN['Pred_'+BlindType+'_N'] = 1.0
    df_PQN = pd.concat([df_pP, df_nP_pQ, df_nP_nQ_pN], ignore_index=False).sort_index()
    
    df_nP_pN = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_N'] == 1])].dropna()
    df_nP_nN = df_nP[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_N'] == 0])].dropna()
    df_nP_nN_pQ = df_nP_nN[temp_df.isin(P_df[P_df['Pred_'+BlindType+'_N'] != 1])].dropna()
    df_nP_nN_pQ['Pred_'+BlindType+'_Q'] = 1.0
    df_PNQ = pd.concat([df_pP, df_nP_pN, df_nP_nN_pQ], ignore_index=False).sort_index()
    
    
    df_pQ = temp_df[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_Q'] == 1])].dropna()
    df_nQ = temp_df[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_Q'] == 0])].dropna()
    
    df_nQ_pP = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_P'] == 1])].dropna()
    df_nQ_nP = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_P'] == 0])].dropna()
    df_nQ_nP_pN = df_nQ_nP[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_P'] != 1])].dropna()
    df_nQ_nP_pN['Pred_'+BlindType+'_N'] = 1.0
    df_QPN = pd.concat([df_pQ, df_nQ_pP, df_nQ_nP_pN], ignore_index=False).sort_index()
    
    df_nQ_pN = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_N'] == 1])].dropna()
    df_nQ_nN = df_nQ[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_N'] == 0])].dropna()
    df_nQ_nN_pP = df_nQ_nN[temp_df.isin(Q_df[Q_df['Pred_'+BlindType+'_N'] != 1])].dropna()
    df_nQ_nN_pP['Pred_'+BlindType+'_P'] = 1.0
    df_QNP = pd.concat([df_pQ, df_nQ_pN, df_nQ_nN_pP], ignore_index=False).sort_index()
    
    
    
    for name, df_x in [('NQP', df_NQP), ('NPQ', df_NPQ), ('PQN', df_PQN),
                       ('PNQ', df_PNQ), ('QPN', df_QPN), ('QNP', df_QNP)]:
        testset_do_validation(testset_blinding_predict, df_x, 'Pred_'+BlindType+'_P', 'Pred_'+BlindType+'_Q', 'Pred_'+BlindType+'_N')
        df_x[BlindType+'_Validate'] = df_x['validate_result']
        del df_x['validate_result']
        print(BlindType+'_'+name+': Done')
    
    return df_NQP, df_NPQ, df_PQN, df_PNQ, df_QPN, df_QNP
In [773]:
BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test = \
evaluate_testset(df_test,"BoI", BoI_N_df_test, BoI_P_df_test, BoI_Q_df_test)
BoI_NQP: Done
BoI_NPQ: Done
BoI_PQN: Done
BoI_PNQ: Done
BoI_QPN: Done
BoI_QNP: Done
In [774]:
BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test = \
evaluate_testset(df_test,"BoA", BoA_N_df_test, BoA_P_df_test, BoA_Q_df_test)
BoA_NQP: Done
BoA_NPQ: Done
BoA_PQN: Done
BoA_PNQ: Done
BoA_QPN: Done
BoA_QNP: Done
In [786]:
def instant_testing(CombineCode, BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test, BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test):
    Test_result_df = pd.DataFrame()
    # vars() is the local namespace, so the ordering code picks the matching argument by name
    Test_result_df['Id'] = vars()['BoI_'+CombineCode+'_df_test']['Id']
    Test_result_df['BoI_Validate'] = vars()['BoI_'+CombineCode+'_df_test']['BoI_Validate']
    Test_result_df['BoA_Validate'] = vars()['BoA_'+CombineCode+'_df_test']['BoA_Validate']
    do_pairing(pair_blinding_unlabel, Test_result_df, 'BoI_Validate', 'BoA_Validate')
    Test_result_df['Prediction'] = Test_result_df['pairing_result']
    del Test_result_df['BoI_Validate'], Test_result_df['BoA_Validate']
    del Test_result_df['pairing_result']
    Test_result_df.to_csv('data/FCNP/Submit2/Test_Submit_'+CombineCode+'.csv', index=False)
    print("Export Complete")
    del Test_result_df
In [787]:
instant_testing('NQP', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [788]:
instant_testing('NPQ', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [789]:
instant_testing('PQN', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [790]:
instant_testing('PNQ', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [791]:
instant_testing('QPN', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [792]:
instant_testing('QNP', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [ ]:
 

Heavy Training Version

In [689]:
df_train_heavy = pd.read_csv("data/FCNP/Train_Heavy.csv")
df_test = pd.read_csv("test.csv")
In [690]:
start_time = datetime.now()

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(df_train_heavy['text'])

# transform the test data using the count vectorizer (unused below)
xtest_count = count_vect.transform(df_test['text'])

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=4096)
tfidf_vect.fit(df_train_heavy['text'])
xtrainheavy_tfidf =  tfidf_vect.transform(df_train_heavy['text'])
xtest_tfidf =  tfidf_vect.transform(df_test['text'])

time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
Time elapsed (hh:mm:ss.ms) 0:00:06.978988
In [694]:
BoI_P_df_test = do_classify_testset(df_test, 'Pred_BoI_P', xtrainheavy_tfidf, xtest_tfidf, df_train_heavy['BoI_P'])
BoI_Q_df_test = do_classify_testset(df_test, 'Pred_BoI_Q', xtrainheavy_tfidf, xtest_tfidf, df_train_heavy['BoI_Q'])
BoI_N_df_test = do_classify_testset(df_test, 'Pred_BoI_N', xtrainheavy_tfidf, xtest_tfidf, df_train_heavy['BoI_N'])
BoA_P_df_test = do_classify_testset(df_test, 'Pred_BoA_P', xtrainheavy_tfidf, xtest_tfidf, df_train_heavy['BoA_P'])
BoA_Q_df_test = do_classify_testset(df_test, 'Pred_BoA_Q', xtrainheavy_tfidf, xtest_tfidf, df_train_heavy['BoA_Q'])
BoA_N_df_test = do_classify_testset(df_test, 'Pred_BoA_N', xtrainheavy_tfidf, xtest_tfidf, df_train_heavy['BoA_N'])
In [695]:
BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test = \
evaluate_testset(df_test,"BoI", BoI_N_df_test, BoI_P_df_test, BoI_Q_df_test)
In [696]:
BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test = \
evaluate_testset(df_test,"BoA", BoA_N_df_test, BoA_P_df_test, BoA_Q_df_test)
In [704]:
# (re-derives label columns already present in Train_Heavy.csv; a harmless overwrite)
extract_feature(blinding_label, df_train_heavy, "Blinding of intervention")
df_train_heavy['BoI_Class'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_label, df_train_heavy, "Blinding of Outcome assessment")
df_train_heavy['BoA_Class'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(pair_of_blinding_label, df_train_heavy, "Classes")
df_train_heavy['Pair_Class'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_positive, df_train_heavy, "Blinding of intervention")
df_train_heavy['BoI_P'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_positive, df_train_heavy, "Blinding of Outcome assessment")
df_train_heavy['BoA_P'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_negative, df_train_heavy, "Blinding of intervention")
df_train_heavy['BoI_N'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_negative, df_train_heavy, "Blinding of Outcome assessment")
df_train_heavy['BoA_N'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_question, df_train_heavy, "Blinding of intervention")
df_train_heavy['BoI_Q'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']

extract_feature(blinding_question, df_train_heavy, "Blinding of Outcome assessment")
df_train_heavy['BoA_Q'] = df_train_heavy['extracted_feature']
del df_train_heavy['extracted_feature']
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
Extraction complete
In [707]:
BoI_NQP_df_test
Out[707]:
Id text Pred_BoI_P Pred_BoI_Q Pred_BoI_N Pred_BoA_P Pred_BoA_Q Pred_BoA_N
0 00002-02.txt abstract objective assess effect pretreatment ... 1.0 0.0 0.0 0.0 1.0 0.0
1 00002-04.txt abstract background steroid pre-treatment usef... 1.0 0.0 0.0 0.0 1.0 0.0
2 00002-06.txt effect accross trial pool result odd ratio cru... 1.0 0.0 0.0 0.0 1.0 0.0
3 00002-07.txt human reproduction vol.11 no.5 pp 1035 1037 19... 0.0 0.0 1.0 0.0 1.0 0.0
4 00002-08.txt 576 progestogen therapy pituitary desensitizat... 1.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ...
390 00053-26.txt original article nottingham trial faecal occul... 1.0 0.0 0.0 0.0 0.0 0.0
391 00053-27.txt baseline findings italian multicenter randomiz... 0.0 1.0 0.0 0.0 1.0 0.0
392 00053-28.txt 1310 articles jnci vol 103 issue 17 september ... 1.0 0.0 0.0 0.0 0.0 0.0
393 00053-29.txt acceptability flexible sigmoidoscopy screen ol... 1.0 0.0 0.0 0.0 0.0 0.0
394 00053-30.txt population-based surveillance colonoscopy effe... 1.0 0.0 0.0 0.0 1.0 0.0

395 rows × 8 columns

In [795]:
def instant_testing(CombineCode, BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test, BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test):
    # redefined so the heavy-model submissions get a '_heavy' filename suffix
    Test_result_df = pd.DataFrame()
    # vars() is the local namespace, so the ordering code picks the matching argument by name
    Test_result_df['Id'] = vars()['BoI_'+CombineCode+'_df_test']['Id']
    Test_result_df['BoI_Validate'] = vars()['BoI_'+CombineCode+'_df_test']['BoI_Validate']
    Test_result_df['BoA_Validate'] = vars()['BoA_'+CombineCode+'_df_test']['BoA_Validate']
    do_pairing(pair_blinding_unlabel, Test_result_df, 'BoI_Validate', 'BoA_Validate')
    Test_result_df['Prediction'] = Test_result_df['pairing_result']
    del Test_result_df['BoI_Validate'], Test_result_df['BoA_Validate']
    del Test_result_df['pairing_result']
    Test_result_df.to_csv('data/FCNP/Submit2/Test_Submit_'+CombineCode+'_heavy.csv', index=False)
    print("Export Complete")
    del Test_result_df
In [796]:
instant_testing('NQP', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [797]:
instant_testing('NPQ', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [798]:
instant_testing('PQN', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [799]:
instant_testing('PNQ', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [800]:
instant_testing('QPN', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [801]:
instant_testing('QNP', BoI_NQP_df_test, BoI_NPQ_df_test, BoI_PQN_df_test, BoI_PNQ_df_test, BoI_QPN_df_test, BoI_QNP_df_test,BoA_NQP_df_test, BoA_NPQ_df_test, BoA_PQN_df_test, BoA_PNQ_df_test, BoA_QPN_df_test, BoA_QNP_df_test)
Pairing complete
Export Complete
In [ ]:
 

Reveal Score on Test:

In [512]:
# NOTE: df_test here is the internal labelled split (data/FCNP/Test.csv), not the Kaggle test set
BoI_P_df_test, BoI_P_Acc = do_classify(df_test, 'Pred_BoI_P', xtrain_tfidf, xtest_tfidf, df_train['BoI_P'], df_test['BoI_P'])
BoI_Q_df_test, BoI_Q_Acc = do_classify(df_test, 'Pred_BoI_Q', xtrain_tfidf, xtest_tfidf, df_train['BoI_Q'], df_test['BoI_Q'])
BoI_N_df_test, BoI_N_Acc = do_classify(df_test, 'Pred_BoI_N', xtrain_tfidf, xtest_tfidf, df_train['BoI_N'], df_test['BoI_N'])
BoA_P_df_test, BoA_P_Acc = do_classify(df_test, 'Pred_BoA_P', xtrain_tfidf, xtest_tfidf, df_train['BoA_P'], df_test['BoA_P'])
BoA_Q_df_test, BoA_Q_Acc = do_classify(df_test, 'Pred_BoA_Q', xtrain_tfidf, xtest_tfidf, df_train['BoA_Q'], df_test['BoA_Q'])
BoA_N_df_test, BoA_N_Acc = do_classify(df_test, 'Pred_BoA_N', xtrain_tfidf, xtest_tfidf, df_train['BoA_N'], df_test['BoA_N'])
In [514]:
# (run against an earlier order_evaluate variant that also returned and printed per-ordering accuracies)
BoI_NQP_df, BoI_NPQ_df, BoI_PQN_df, BoI_PNQ_df, BoI_QPN_df, BoI_QNP_df, BoI_NQP, BoI_NPQ, BoI_PQN, BoI_PNQ, BoI_QPN, BoI_QNP = \
order_evaluate(df_test,"BoI", BoI_N_df_test, BoI_P_df_test, BoI_Q_df_test)
BoI_NQP: 54.02298850574713
BoI_NPQ: 54.166666666666664
BoI_PQN: 56.32183908045977
BoI_PNQ: 58.620689655172406
BoI_QPN: 56.32183908045977
BoI_QNP: 54.02298850574713
In [515]:
BoA_NQP_df, BoA_NPQ_df, BoA_PQN_df, BoA_PNQ_df, BoA_QPN_df, BoA_QNP_df, BoA_NQP, BoA_NPQ, BoA_PQN, BoA_PNQ, BoA_QPN, BoA_QNP = \
order_evaluate(df_test,"BoA", BoA_N_df_test, BoA_P_df_test, BoA_Q_df_test)
BoA_NQP: 52.87356321839081
BoA_NPQ: 53.125
BoA_PQN: 51.724137931034484
BoA_PNQ: 57.47126436781609
BoA_QPN: 47.368421052631575
BoA_QNP: 48.421052631578945
In [517]:
Test_result_df = pd.DataFrame()
Test_result_df['Filename'] = BoI_PNQ_df['Filename']
Test_result_df['BoI_Validate'] = BoI_PNQ_df['BoI_Validate']
Test_result_df['BoA_Validate'] = BoA_PNQ_df['BoA_Validate']
do_pairing(pair_blinding_unlabel, Test_result_df, 'BoI_Validate', 'BoA_Validate')
Pairing complete
In [518]:
Test_result_df['Id'] = Test_result_df['Filename']
Test_result_df['Prediction'] = Test_result_df['pairing_result']
del Test_result_df['BoI_Validate'], Test_result_df['BoA_Validate'], Test_result_df['pairing_result'], Test_result_df['Filename']

Test_result_df.head()
Out[518]:
Id Prediction
0 00101-05.txt PP
1 00186-28.txt QQ
2 00101-12.txt PP
3 00186-21.txt PP
4 00170-17.txt QQ
In [519]:
Test_result_df.to_csv('data/FCNP/Test_Submit.csv', index = False)
In [520]:
Test_result_df['y'] = df_test['Classes']
do_pairing(score_pairing, Test_result_df, 'Prediction', 'y')
Pairing complete
In [521]:
Test_result_df['Score'] = Test_result_df['pairing_result']
del Test_result_df['y'], Test_result_df['pairing_result']

Test_result_df.head()
Out[521]:
Id Prediction Score
0 00101-05.txt PP 1
1 00186-28.txt QQ 0
2 00101-12.txt PP 1
3 00186-21.txt PP 1
4 00170-17.txt QQ 1
In [522]:
accuracy_pair_result(Test_result_df, 'Score')
Out[522]:
60.91954022988506
In [328]:
# scratch: decision scores from the classifier left over in the interactive session
LR_WordTFIDF_classifier.decision_function(xvalid_tfidf)
Out[328]:
array([-0.89404623, -1.4009602 , -1.22108588, -0.57296177, -0.61954885,
       -0.95992421, -0.0893131 ,  0.68450428, -0.94706255, -0.34255242,
       -0.9981059 , -1.24143738, -0.85268053, -0.68198497,  0.28011322,
        0.08737787, -0.97712605,  0.04702293, -0.97164481, -0.89900743,
        0.06502595, -0.43610601,  0.63794073, -0.76144881, -0.36658689,
       -0.43722796, -1.59695322,  0.14369085, -0.83941846, -1.64547165,
       -0.61462493, -0.46628054, -0.37456379,  0.21087659,  0.31559984,
       -0.13173766,  0.61349844, -1.27071691, -0.66926218, -0.12348428,
       -1.21771227,  0.50542211,  0.75872566, -0.84244127,  0.67725103,
       -0.14333524,  0.08149343, -1.25283455, -1.03671865,  0.01110967,
       -0.42993185, -1.04069196, -0.11587604, -1.38868448, -0.58794917,
        0.56376854,  0.23970988,  0.09162296, -0.1621431 , -0.88467353,
       -0.28965624, -0.4789487 , -0.85943172, -0.84056005, -0.44864897,
        0.63308363, -0.973627  , -1.56985808, -0.06900583,  0.27742142,
       -0.45648308, -0.77277375, -0.26034991, -0.17789847, -0.99008648,
       -1.51212311, -0.14046306, -1.74277431, -0.7851883 ,  0.01126602,
       -0.00337051,  0.02248249, -0.95213068, -0.69922509,  0.2815453 ,
       -0.84450051, -1.28320909])