# -*- coding: utf-8 -*-
"""SymptomSuggestion.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VCTKzIKXnReWZoBj9NQ6td6oosUz_hmj
# **Disease Detection using Symptoms and Treatment Recommendation**
This notebook detects a disease from the symptoms entered and selected by the user and recommends appropriate treatments.
"""
# Predicts diseases based on the symptoms entered and selected by the user.
# importing all necessary libraries
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
from nltk.corpus import wordnet
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from time import time
from collections import Counter
import operator
from xgboost import XGBClassifier
import math
from Treatment import diseaseDetail
from sklearn.linear_model import LogisticRegression
warnings.simplefilter("ignore")
"""Download resources required for NLTK pre-processing"""
import nltk
nltk.download('all')
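# Downloading everything is convenient in Colab but heavy; of the NLTK corpora,
# this script only needs 'stopwords' and 'wordnet' (plus 'omw-1.4' on newer
# NLTK versions) for the stopword list and the lemmatizer.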
"""**synonyms function** finds the synonymous terms of a symptom entered by the user.
This is necessary as the user may use a term for a symptom which may be different from the one present in dataset.
This improves the accuracy by reducing the wrong predictions even when symptoms for a disease are entered slightly different than the ones on which model is trained.
*Synonyms are searched on Thesaurus.com and NLTK Wordnet*
"""
# returns the list of synonyms of the input word from thesaurus.com (https://www.thesaurus.com/) and wordnet (https://www.nltk.org/howto/wordnet.html)
def synonyms(term):
    synonyms = []
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    soup = BeautifulSoup(response.content, "html.parser")
    try:
        container = soup.find('section', {'class': 'MainContentContainer'})
        row = container.find('div', {'class': 'css-191l5o0-ClassicContentCard'})
        row = row.find_all('li')
        for x in row:
            synonyms.append(x.get_text())
    except Exception:
        # Thesaurus.com's markup changes over time; if scraping fails, fall back to WordNet only.
        pass
    for syn in wordnet.synsets(term):
        synonyms += syn.lemma_names()
    return set(synonyms)
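# Illustrative call (output hypothetical; actual results depend on
# thesaurus.com's current markup and the installed WordNet corpus):
#   >>> synonyms('fever')
#   {'fever', 'pyrexia', 'febricity', 'feverishness', ...}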
# utilities for pre-processing
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')
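# Illustrative behaviour of the utilities above (standard NLTK calls):
#   splitter.tokenize("head-ache & fever")  ->  ['head', 'ache', 'fever']
#   lemmatizer.lemmatize('headaches')       ->  'headache'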
"""**Disease Symptom dataset** was created in a separate python program.
**Dataset scrapping** was done using **NHP website** and **wikipedia data**
Disease Combination dataset contains the combinations for each of the disease present in dataset as practically it is often observed that it is not necessary for a person to have a disease when all the symptoms are faced by the patient or the user.
*To tackle this problem, combinations are made with the symptoms for each disease.*
**This increases the size of the data exponentially and helps the model to predict the disease with much better accuracy.**
*df_comb -> Dataframe consisting of dataset generated by combining symptoms for each disease.*
*df_norm -> Dataframe consisting of dataset which contains a single row for each diseases with all the symptoms for that corresponding disease.*
**Dataset contains 261 diseases and their symptoms**
"""
# Load dataset scraped from NHP (https://www.nhp.gov.in/disease-a-z) & Wikipedia
# Scraping and creation of the dataset CSVs is done in a separate program
df_comb = pd.read_csv("/content/drive/My Drive/Python Project data/IR_Project/dis_sym_dataset_comb.csv") # Disease combination
df_norm = pd.read_csv("/content/drive/My Drive/Python Project data/IR_Project/dis_sym_dataset_norm.csv") # Individual Disease
X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]
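# Expected layout of both CSVs, inferred from how they are used below: the first
# column 'label_dis' holds the disease name, and every remaining column is a
# symptom flagged 0/1, e.g. (hypothetical row):
#   label_dis, fever, cough, headache, ...
#   Influenza,     1,     1,        1, ...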
"""Using **Logistic Regression (LR) Classifier** as it gives better accuracy compared to other classification models as observed in the comparison of model accuracies in Model_latest.py
Cross validation is done on dataset with cv = 5
"""
lr = LogisticRegression()
lr = lr.fit(X, Y)
scores = cross_val_score(lr, X, Y, cv=5)
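# mean(scores) is reused near the end of the script to scale the reported
# disease probabilities.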
X = df_norm.iloc[:, 1:]
Y = df_norm.iloc[:, 0:1]
# List of symptoms
dataset_symptoms = list(X.columns)
"""# Symptoms initially taken from user."""
# Taking symptoms from user as input
user_symptoms = input("Please enter symptoms separated by commas:\n").lower().split(',')
# Preprocessing the input symptoms
processed_user_symptoms = []
for sym in user_symptoms:
    sym = sym.strip()
    sym = sym.replace('-', ' ')
    sym = sym.replace("'", '')
    sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
    processed_user_symptoms.append(sym)
"""Pre-processing on symptoms entered by user is done."""
# Take each user symptom, find all its synonyms, and append them to the pre-processed symptom string
user_symptoms = []
for user_sym in processed_user_symptoms:
    user_sym = user_sym.split()
    str_sym = set()
    for comb in range(1, len(user_sym) + 1):
        for subset in combinations(user_sym, comb):
            subset = ' '.join(subset)
            subset = synonyms(subset)
            str_sym.update(subset)
    str_sym.add(' '.join(user_sym))
    user_symptoms.append(' '.join(str_sym).replace('_', ' '))
# query expansion performed by joining the synonyms found for each symptom initially entered
print("After query expansion, the expanded symptom strings are:")
print(user_symptoms)
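# For instance, an input like "sore throat" typically expands to a string
# containing 'sore', 'throat' and 'sore throat' plus WordNet lemmas such as
# 'pharynx' or 'raw' (illustrative; actual output varies with the corpora).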
"""The below procedure is performed in order to show the symptom synonmys found for the symptoms entered by the user.
The symptom synonyms and user symptoms are matched with the symptoms present in dataset. Only the symptoms which matches the symptoms present in dataset are shown back to the user.
"""
# Loop over all the symptoms in the dataset and check each one's similarity to the synonym
# strings of the user-input symptoms. If similarity > 0.5, add the symptom to the final list
found_symptoms = set()
for data_sym in dataset_symptoms:
    data_sym_split = data_sym.split()
    for user_sym in user_symptoms:
        count = 0
        for symp in data_sym_split:
            if symp in user_sym.split():
                count += 1
        if count / len(data_sym_split) > 0.5:
            found_symptoms.add(data_sym)
found_symptoms = list(found_symptoms)
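# Worked example of the matching rule above: a two-token dataset symptom such
# as 'chest pain' needs both 'chest' and 'pain' in the expanded user string
# (1/2 is not strictly greater than 0.5), while a one-token symptom matches on
# a single hit.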
"""## **Prompt the user to select the relevant symptoms by entering the corresponding indices.**"""
# Print all found symptoms
print("Top matching symptoms from your search!")
for idx, symp in enumerate(found_symptoms):
    print(idx, ":", symp)
# Show the related symptoms found in the dataset and ask user to select among them
select_list = input("\nPlease select the relevant symptoms. Enter indices (space-separated):\n").split()
# Find other relevant symptoms from the dataset based on the highest co-occurrence with the
# symptoms selected by the user
dis_list = set()
final_symp = []
counter_list = []
for idx in select_list:
    symp = found_symptoms[int(idx)]
    final_symp.append(symp)
    dis_list.update(set(df_norm[df_norm[symp] == 1]['label_dis']))
for dis in dis_list:
    row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()
    row[0].pop(0)
    for idx, val in enumerate(row[0]):
        if val != 0 and dataset_symptoms[idx] not in final_symp:
            counter_list.append(dataset_symptoms[idx])
"""## To find symptoms which generally co-occur, for example with symptoms like cough, headache generally happens hence they co-occur."""
# Symptoms that co-occur with the ones selected by user
dict_symp = dict(Counter(counter_list))
dict_symp_tup = sorted(dict_symp.items(), key=operator.itemgetter(1),reverse=True)
#print(dict_symp_tup)
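# dict_symp_tup is now a list of (symptom, co-occurrence count) pairs in
# descending order of count, e.g. [('fatigue', 12), ('nausea', 9), ...]
# (names and counts illustrative).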
"""## User is presented with a list of co-occuring symptoms to select from and is performed iteratively to recommend more possible symptoms based on the similarity to the previously entered symptoms.
As the co-occuring symptoms can be in overwhelming numbers, only the top 5 are recommended to the user from which user can select the symptoms.
If user does not have any of those 5 symptoms and wants to see the next 5, he can do so by giving input as -1.
To stop the recommendation, user needs to give input as "No".
"""
# Iteratively suggest the top co-occurring symptoms to the user and ask them to select the applicable ones
found_symptoms = []
count = 0
for tup in dict_symp_tup:
    count += 1
    found_symptoms.append(tup[0])
    if count % 5 == 0 or count == len(dict_symp_tup):
        print("\nCommon co-occurring symptoms:")
        for idx, ele in enumerate(found_symptoms):
            print(idx, ":", ele)
        select_list = input("Do you have any of these symptoms? If yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:\n").lower().split()
        if select_list[0] == 'no':
            break
        if select_list[0] == '-1':
            found_symptoms = []
            continue
        for idx in select_list:
            final_symp.append(found_symptoms[int(idx)])
        found_symptoms = []
"""Final Symptom list"""
# Create query vector based on symptoms selected by the user
print("\nFinal list of Symptoms that will be used for prediction:")
sample_x = [0] * len(dataset_symptoms)
for val in final_symp:
    print(val)
    sample_x[dataset_symptoms.index(val)] = 1
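# sample_x is now a one-hot query vector over dataset_symptoms: every entry is
# 0 except the positions of the selected symptoms, which are set to 1.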
"""Prediction of disease is done"""
# Predict disease
lr = LogisticRegression()
lr = lr.fit(X, Y.values.ravel())  # refit on df_norm (X and Y were reassigned above)
prediction = lr.predict_proba([sample_x])
"""Show top k diseases and their probabilities to the user.
K in this case is 10
"""
k = 10
diseases = list(set(Y['label_dis']))
diseases.sort()
topk = prediction[0].argsort()[-k:][::-1]
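# argsort()[-k:][::-1] yields the indices of the k largest predicted
# probabilities, in descending order.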
"""# **Showing the list of top k diseases to the user with their prediction probabilities.**
# **For getting information about the suggested treatments, user can enter the corresponding index to know more details.**
"""
print(f"\nTop {k} diseases predicted based on symptoms")
topk_dict = {}
# Show the top 10 most probable diseases to the user.
for t in topk:
    match_sym = set()
    row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()
    row[0].pop(0)
    for idx, val in enumerate(row[0]):
        if val != 0:
            match_sym.add(dataset_symptoms[idx])
    prob = (len(match_sym.intersection(set(final_symp))) + 1) / (len(set(final_symp)) + 1)
    prob *= mean(scores)
    topk_dict[t] = prob
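# Note: the displayed "probability" is not the raw predict_proba value. It is
# an add-one-smoothed symptom-overlap ratio, (matched + 1) / (selected + 1),
# scaled by the mean cross-validation accuracy computed earlier; predict_proba
# is only used to shortlist the top-k candidate diseases.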
j = 0
topk_index_mapping = {}
topk_sorted = dict(sorted(topk_dict.items(), key=lambda kv: kv[1], reverse=True))
for key in topk_sorted:
    prob = topk_sorted[key] * 100
    print(str(j) + " Disease name:", diseases[key], "\tProbability:", str(round(prob, 2)) + "%")
    topk_index_mapping[j] = key
    j += 1
select = input("\nMore details about the disease? Enter index of disease or '-1' to discontinue and close the system:\n")
if select != '-1':
    dis = diseases[topk_index_mapping[int(select)]]
    print()
    print(diseaseDetail(dis))