from __future__ import unicode_literals

__author__ = 'kangtian'

import re
import nltk
import xml.etree.ElementTree as xml_parser

from web import download_web_data
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()

################# download train corpus : start ########################

def get_disease_clinical_trials(ldisease):
    # Retrieve trial IDs from clinicaltrials.gov for the given condition.
    #ldisease = sorted(map(lambda x: ' '.join(x.lower().split()), ldisease))
    trial_ids = []
    d = ldisease.replace(',', '')
    fd = d.replace(' ', '+')
    url = 'https://clinicaltrials.gov/ct2/results?cond=%s&displayxml=true'
    if ldisease == 'all':
        url = 'https://clinicaltrials.gov/ct2/results?%sdisplayxml=true'
        fd = ''
    print url % fd
    # The first request only reports the number of studies available.
    xml = download_web_data(url % fd)
    xmltree = xml_parser.fromstring(xml)
    n = int(xmltree.get('count'))
    print n
    # Page through the result set one study per request and collect the NCT IDs.
    url_final = url + '&start=%d&count=%s'
    for j in range(1, n, 1):
        if j % 5000 == 0:
            print "id: 0-", j, " ..."
        xmltree = xml_parser.fromstring(download_web_data(url_final % (fd, j, 1)))
        lnct = xmltree.findall('clinical_study')
        for ct in lnct:
            cod = ct.find('nct_id')
            if cod is None:
                continue
            #print "trial %s" % cod.text
            trial_ids.append(cod.text)
    return trial_ids


def extract_criteria(cid):
    # Use a trial ID to retrieve the eligibility criteria text block.
    output = ""
    if cid is not None:
        url_trial = 'http://clinicaltrials.gov/show/%s?displayxml=true'
        #url_trial = 'http://clinicaltrials.gov/search?term=%s&displayxml=true'
        page = download_web_data(url_trial % cid)
        if page is not None:
            ct_xml = xml_parser.fromstring(page)
            ec = ct_xml.find('eligibility')
            if ec is not None:
                # Parse to get the criteria text.
                d = ec.find('criteria')
                if d is not None:
                    txt = d.find('textblock')
                    if txt is not None:
                        output = txt.text
    return output


def extract_description(cid):
    # Use a trial ID to retrieve the brief summary and detailed description.
    output = ""
    if cid is not None:
        url_trial = 'http://clinicaltrials.gov/show/%s?displayxml=true'
        #url_trial = 'http://clinicaltrials.gov/search?term=%s&displayxml=true'
        page = download_web_data(url_trial % cid)
        if page is not None:
            ct_xml = xml_parser.fromstring(page)
            summary = ct_xml.find('brief_summary')
            if summary is not None:
                txt = summary.find('textblock')
                if txt is not None:
                    output = txt.text
            description = ct_xml.find('detailed_description')
            if description is not None:
                txt2 = description.find('textblock')
                if txt2 is not None:
                    output = output + txt2.text
    return output
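
# Minimal sanity-check sketch: fetch one trial's criteria and description and
# report their sizes. Assumes network access to the ct2 XML endpoints used
# above; NCT00000105 is the ID used in the commented-out example call below.
def demo_single_trial(nct_id='NCT00000105'):
    criteria = extract_criteria(nct_id)
    description = extract_description(nct_id)
    print 'eligibility criteria characters:', len(criteria)
    print 'summary/description characters:', len(description)
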
def sentence_splitting(texts, slen=1):
    # Split eligibility criteria text into separate sentences.
    if len(texts) <= 0:
        return []
    sentences = []
    text_sents = nltk.sent_tokenize(texts)
    if text_sents != [''] and len(text_sents) > 0:
        for sent in text_sents:
            sent = re.sub(r'e\.g\.', 'eg', sent)  # keep "e.g." from triggering a split
            sent = sent.strip().split('\r')  # split strings that contain "\r"
            for sen in sent:
                se = re.split('[.;]', sen)
                for s in se:
                    ss = s.split('- ')
                    for final in ss:
                        # Skip fragments that are only a list number such as "1."
                        match = re.match(r'^\d+\.\s*$', final)
                        if match:
                            continue
                        final = re.sub(r'\s+$', '', final)
                        final = re.sub(r'\d+\.', '', final)
                        # Drop any non-ASCII characters before tokenising.
                        words = final.encode('ascii', 'ignore').split(' ')
                        new_words = []
                        for w in words:
                            if w:
                                match = re.search(r'(\(*\w+\)*,*.*)', w)
                                if match:
                                    new_words.append(match.group(1))
                        new_sent = ' '.join(new_words)
                        if new_sent:
                            sentences.append(new_sent)
    return sentences


def retrieve_train_corpurs(input_condition, new_train_address):
    # Main function: retrieve eligibility criteria, split them into sentences
    # and save them into the output file.
    myfile = open(new_train_address, 'w')
    print '...retrieving the train corpus on ' + input_condition + '...'
    trial_ids = get_disease_clinical_trials(input_condition)
    print "...trial_id retrieved!"
    i = 0
    for tid in trial_ids:
        i += 1
        if i % 10000 == 0:
            print "...0-", i, "texts retrieved..."
        print >>myfile, '>>' + tid
        ec = extract_criteria(tid)
        sents = sentence_splitting(ec, slen=1)
        for s in sents:
            print >>myfile, s
    myfile.close()
    print 'train corpus successfully retrieved!'

################# download train corpus : END ########################
#retrieve_train_corpurs('type II diabetes', 'files/DMII.txt')
#a = extract_criteria("NCT00000105")
#print a


def stem_sent(sent):
    # Remove tokens that are only numbers or other symbols (stemming itself is disabled).
    words = sent.split(' ')
    stem = []
    for w in words:
        if re.match(r'^[\W\d]+$', w):
            continue
        #stem.append(st.stem(w))
        stem.append(w)
    stemmed_sent = ' '.join(stem)
    return stemmed_sent


def remove_stopwords(sent):
    # Remove English stop words (plus a few time units) from the sentence.
    stopwords_list = stopwords.words('english')
    for w in ['years', 'year', 'months', 'month', 'day', 'days']:
        stopwords_list.append(w)
    new_sent = ' '.join([word for word in sent.decode('ascii', 'ignore').split()
                         if word not in stopwords_list])
    return new_sent
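

# Usage sketch (illustrative only): the condition and output path mirror the
# commented-out call above, and the sample sentence is a made-up criteria line
# showing what the cleanup helpers produce. The corpus download issues one HTTP
# request per trial, so it is left commented out here.
if __name__ == '__main__':
    # Build a training corpus of eligibility criteria for one condition:
    #retrieve_train_corpurs('type II diabetes', 'files/DMII.txt')

    # Local demo of the sentence cleanup helpers on a hypothetical criteria line.
    sample = 'Patients aged 18 years or older with type II diabetes.'
    for sentence in sentence_splitting(sample):
        print remove_stopwords(stem_sent(sentence))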