import pandas as pd import SimpleITK as sitk import os import os.path import numpy as np fold = 1 def load_itk_image(filename): with open(filename) as f: contents = f.readlines() line = [k for k in contents if k.startswith('TransformMatrix')][0] transformM = np.array(line.split(' = ')[1].split(' ')).astype('float') transformM = np.round(transformM) if np.any( transformM!=np.array([1,0,0, 0, 1, 0, 0, 0, 1])): isflip = True else: isflip = False itkimage = sitk.ReadImage(filename) numpyImage = sitk.GetArrayFromImage(itkimage) numpyOrigin = np.array(list(reversed(itkimage.GetOrigin()))) numpySpacing = np.array(list(reversed(itkimage.GetSpacing()))) return numpyImage, numpyOrigin, numpySpacing,isflip def worldToVoxelCoord(worldCoord, origin, spacing): stretchedVoxelCoord = np.absolute(worldCoord - origin) voxelCoord = stretchedVoxelCoord / spacing return voxelCoord # read map file mapfname = 'LIDC-IDRI-mappingLUNA16' sidmap = {} fid = open(mapfname, 'r') line = fid.readline() line = fid.readline() while line: pidlist = line.split(' ') # print pidlist pid = pidlist[0] stdid = pidlist[1] srsid = pidlist[2] if srsid not in sidmap: sidmap[srsid] = [pid, stdid] else: assert sidmap[srsid][0] == pid assert sidmap[srsid][1] == stdid line = fid.readline() fid.close() # read luna16 annotation colname = ['seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm'] lunaantframe = pd.read_csv('annotations.csv', names=colname) srslist = lunaantframe.seriesuid.tolist()[1:] cdxlist = lunaantframe.coordX.tolist()[1:] cdylist = lunaantframe.coordY.tolist()[1:] cdzlist = lunaantframe.coordZ.tolist()[1:] dimlist = lunaantframe.diameter_mm.tolist()[1:] lunaantdict = {} for idx in xrange(len(srslist)): vlu = [float(cdxlist[idx]), float(cdylist[idx]), float(cdzlist[idx]), float(dimlist[idx])] if srslist[idx] in lunaantdict: lunaantdict[srslist[idx]].append(vlu) else: lunaantdict[srslist[idx]] = [vlu] # # convert luna16 annotation to LIDC-IDRI annotation space # from multiprocessing import Pool # lunantdictlidc = {} # for fold in xrange(10): # mhdpath = '/media/data1/wentao/tianchi/luna16/subset'+str(fold) # print 'fold', fold # def getvoxcrd(fname): # sliceim,origin,spacing,isflip = load_itk_image(os.path.join(mhdpath, fname)) # lunantdictlidc[fname[:-4]] = [] # voxcrdlist = [] # for lunaant in lunaantdict[fname[:-4]]: # voxcrd = worldToVoxelCoord(lunaant[:3][::-1], origin, spacing) # voxcrd[-1] = sliceim.shape[0] - voxcrd[0] # voxcrdlist.append(voxcrd) # return voxcrdlist # p = Pool(30) # fnamelist = [] # for fname in os.listdir(mhdpath): # if fname.endswith('.mhd') and fname[:-4] in lunaantdict: # fnamelist.append(fname) # voxcrdlist = p.map(getvoxcrd, fnamelist) # listidx = 0 # for fname in os.listdir(mhdpath): # if fname.endswith('.mhd') and fname[:-4] in lunaantdict: # lunantdictlidc[fname[:-4]] = [] # for subidx, lunaant in enumerate(lunaantdict[fname[:-4]]): # # voxcrd = worldToVoxelCoord(lunaant[:3][::-1], origin, spacing) # # voxcrd[-1] = sliceim.shape[0] - voxcrd[0] # lunantdictlidc[fname[:-4]].append([lunaant, voxcrdlist[listidx][subidx]]) # listidx += 1 # p.close() # np.save('lunaantdictlidc.npy', lunantdictlidc) # read LIDC dataset lunantdictlidc = np.load('lunaantdictlidc.npy').item() import xlrd lidccsvfname = '/media/data1/wentao/LIDC-IDRI/list3.2.xls' antdict = {} wb = xlrd.open_workbook(os.path.join(lidccsvfname)) for s in wb.sheets(): if s.name == 'list3.2': for row in range(1, s.nrows): valuelist = [int(s.cell(row, 2).value), s.cell(row, 3).value, s.cell(row, 4).value, \ int(s.cell(row, 5).value), int(s.cell(row, 6).value), int(s.cell(row, 7).value)] assert abs(s.cell(row, 1).value - int(s.cell(row, 1).value)) < 1e-8 assert abs(s.cell(row, 2).value - int(s.cell(row, 2).value)) < 1e-8 assert abs(s.cell(row, 5).value - int(s.cell(row, 5).value)) < 1e-8 assert abs(s.cell(row, 6).value - int(s.cell(row, 6).value)) < 1e-8 assert abs(s.cell(row, 7).value - int(s.cell(row, 7).value)) < 1e-8 for col in range(9, 16): if s.cell(row, col).value != '': if isinstance(s.cell(row, col).value, float): valuelist.append(str(int(s.cell(row, col).value))) assert abs(s.cell(row, col).value - int(s.cell(row, col).value)) < 1e-8 else: valuelist.append(s.cell(row, col).value) if s.cell(row, 0).value+'_'+str(int(s.cell(row, 1).value)) not in antdict: antdict[s.cell(row, 0).value+'_'+str(int(s.cell(row, 1).value))] = [valuelist] else: antdict[s.cell(row, 0).value+'_'+str(int(s.cell(row, 1).value))].append(valuelist) # update LIDC annotation with series number, rather than scan id import dicom LIDCpath = '/media/data1/wentao/LIDC-IDRI/DOI/' antdictscan = {} for k, v in antdict.iteritems(): pid, scan = k.split('_') hasscan = False for sdu in os.listdir(os.path.join(LIDCpath, 'LIDC-IDRI-'+pid)): for srs in os.listdir(os.path.join(*[LIDCpath, 'LIDC-IDRI-'+pid, sdu])): if srs.endswith('.npy'): print 'npy', pid, scan, srs continue RefDs = dicom.read_file(os.path.join(*[LIDCpath, 'LIDC-IDRI-'+pid, sdu, srs, '000006.dcm'])) # print scan, str(RefDs[0x20, 0x11].value) if str(RefDs[0x20, 0x11].value) == scan or scan == '0': if hasscan: print 'rep', pid, sdu, srs hasscan = True antdictscan[pid+'_'+srs] = v break if not hasscan: print 'not found', pid, scan, sdu, srs # find the match from LIDC-IDRI annotation import math lunaantdictnodid = {} maxdist = 0 for srcid, lunaantlidc in lunantdictlidc.iteritems(): lunaantdictnodid[srcid] = [] pid, stdid = sidmap[srcid] # print pid pid = pid[len('LIDC-IDRI-'):] for lunantdictlidcsub in lunaantlidc: lunaant = lunantdictlidcsub[0] voxcrd = lunantdictlidcsub[1] # z y x mindist, minidx = 1e8, -1 if srcid in ['1.3.6.1.4.1.14519.5.2.1.6279.6001.174692377730646477496286081479', '1.3.6.1.4.1.14519.5.2.1.6279.6001.300246184547502297539521283806']: continue for idx, lidcant in enumerate(antdictscan[pid+'_'+srcid]): dist = math.pow(voxcrd[0] - lidcant[3], 2) # z dist = math.pow(voxcrd[1] - lidcant[4], 2) # y dist += math.pow(voxcrd[2] - lidcant[5], 2) # x if dist < mindist: mindist = dist minidx = idx if mindist > 71:#15.1: print srcid, pid, voxcrd, antdictscan[pid+'_'+srcid], mindist maxdist = max(maxdist, mindist) lunaantdictnodid[srcid].append([lunaant, antdictscan[pid+'_'+srcid][minidx][6:]]) # np.save('lunaantdictnodid.npy', lunaantdictnodid) print 'maxdist', maxdist # save it into a csv # import csv # savename = 'annotationnodid.csv' # fid = open(savename, 'w') # writer = csv.writer(fid) # writer.writerow(['seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm']) # for srcid, ant in lunaantdictnodid.iteritems(): # for antsub in ant: # writer.writerow([srcid] + [antsub[0][0], antsub[0][1], antsub[0][2], antsub[0][3]] + antsub[1]) # fid.close() # fd 1 fd1lst = [] for fname in os.listdir('/media/data1/wentao/tianchi/luna16/subset'+str(fold)+'/'): if fname.endswith('.mhd'): fd1lst.append(fname[:-4]) # find the malignancy, shape information from xml file import xml.dom.minidom ndoc = 0 lunadctclssgmdict = {} mallstall, callstall, sphlstall, marlstall, loblstall, spilstall, texlstall = [], [], [], [], [], [], [] for srsid, extant in lunaantdictnodid.iteritems(): if srsid not in fd1lst: continue lunadctclssgmdict[srsid] = [] pid, stdid = sidmap[srsid] for extantvlu in extant: getnodid = [] nant = 0 mallst = [] for fname in os.listdir(os.path.join(*['/media/data1/wentao/LIDC-IDRI/DOI/', pid, stdid, srsid])): if fname.endswith('.xml'): nant += 1 dom = xml.dom.minidom.parse(os.path.join(*['/media/data1/wentao/LIDC-IDRI/DOI/', pid, stdid, srsid, fname])) root = dom.documentElement rsessions = root.getElementsByTagName('readingSession') for rsess in rsessions: unblinds = rsess.getElementsByTagName('unblindedReadNodule') for unb in unblinds: nod = unb.getElementsByTagName('noduleID') if len(nod) != 1: print 'more nod', nod continue if nod[0].firstChild.data in extantvlu[1]: getnodid.append(nod[0].firstChild.data) mal = unb.getElementsByTagName('malignancy') if len(mal) == 1 and int(mal[0].firstChild.data) in range(1, 6, 1): mallst.append(float(mal[0].firstChild.data)) # print(getnodid, extantvlu[1], nant) if len(getnodid) > len(extantvlu[1]): print pid, srsid # assert 1 == 0 ndoc = max(ndoc, len(getnodid), len(extantvlu[1])) vlulst = [srsid, extantvlu[0][0], extantvlu[0][1], extantvlu[0][2], extantvlu[0][3]] if len(mallst) == 0: vlulst.append(0) else: vlulst.append(sum(mallst)/float(len(mallst))) lunadctclssgmdict[srsid].append(vlulst+mallst) import csv # load predition array # pixdimpred = np.load('../../../CTnoddetector/training/nodcls/checkpoint-'+str(fold)+'/besttestpred.npy')#'pixradiustest.npy') pdframe = pd.read_csv('annotationdetclsconv_v3.csv', names=['seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm', 'malignant']) srslst = pdframe['seriesuid'].tolist()[1:] crdxlst = pdframe['coordX'].tolist()[1:] crdylst = pdframe['coordY'].tolist()[1:] crdzlst = pdframe['coordZ'].tolist()[1:] dimlst = pdframe['diameter_mm'].tolist()[1:] mlglst = pdframe['malignant'].tolist()[1:] newlst = [] import csv # fid = open('annotationdetclsconvfnl_v3.csv', 'w') # writer = csv.writer(fid) # writer.writerow(['seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm', 'malignant']) for i in xrange(len(srslst)): # writer.writerow([srslst[i]+'-'+str(i), crdxlst[i], crdylst[i], crdzlst[i], dimlst[i], mlglst[i]]) newlst.append([srslst[i]+'-'+str(i), crdxlst[i], crdylst[i], crdzlst[i], dimlst[i], mlglst[i]]) # fid.close() subset1path = '/media/data1/wentao/tianchi/luna16/subset'+str(fold)+'/' testfnamelst = [] for fname in os.listdir(subset1path): if fname.endswith('.mhd'): testfnamelst.append(fname[:-4]) ntest = 0 for idx in xrange(len(newlst)): fname = newlst[idx][0] if fname.split('-')[0] in testfnamelst: ntest +=1 print 'ntest', ntest, 'ntrain', len(newlst)-ntest prednamelst = {} predacc = 0 predidx = 0 # predlabellst = [] # for idx in xrange(len(newlst)): # fname = newlst[idx][0] # if fname.split('-')[0] in testfnamelst: # # print newlst[idx][-1], pixdimpred[predidx] # if int(pixdimpred[predidx]>0.5) == int(newlst[idx][-1]): predacc += 1 # if fname.split('-')[0] not in prednamelst: # prednamelst[fname.split('-')[0]] = [pixdimpred[predidx]] # else: # prednamelst[fname.split('-')[0]].append(pixdimpred[predidx]) # predidx += 1 # print 'pred acc', predacc/float(predidx) pixdimidx = -1 # savename = 'annotationdetclssgm_doctor_fd2.csv' # fid = open(savename, 'w') # writer = csv.writer(fid) # writer.writerow(['seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm', 'malignant']) # get the patient level cancer ptlabel = {} nfold1 = 0 for srsid, extant in lunadctclssgmdict.iteritems(): if srsid not in fd1lst: continue nfold1 += 1 for subextant in extant: if subextant[5] in [3, 0]: continue if abs(subextant[5] - 3) < 1e-2: continue if subextant[5] > 3: ptlabel[srsid] = 1 break else: ptlabel[srsid] = 0 print(len(lunadctclssgmdict.keys()), nfold1, len(ptlabel.keys()), sum(ptlabel.values())) # get doctors prediction on patient level cancer dctptlabel = {} for srsid, extant in lunadctclssgmdict.iteritems(): curidx = 0 if srsid not in fd1lst: continue for subextant in extant: if subextant[5] in [3, 0]: continue if abs(subextant[5] - 3) < 1e-2: continue dctptlabel[srsid] = [-1]*max(len(subextant) - 6, 4) # writer.writerow(subextant) for did in xrange(6, len(subextant),1):#len(subextant), 1): # if 0.499 <= prednamelst[srsid][curidx] <= 0.501: continue if subextant[did] == 3: continue if subextant[did] > 3: dctptlabel[srsid][did-6] = 1 if dctptlabel[srsid][did-6] == -1: dctptlabel[srsid][did-6] = 0 # calculate patient level performance p = [0]*4 n = [0]*4 tp = [0]*4 np = [0]*4 for srs in ptlabel.keys(): if dctptlabel[srs][0] != -1: n[0] += 1 if ptlabel[srs] == 1: np[0] += 1 if ptlabel[srs] == dctptlabel[srs][0]: p[0] += 1 if ptlabel[srs] == 1: tp[0] += 1 if dctptlabel[srs][1] != -1: n[1] += 1 if ptlabel[srs] == 1: np[1] += 1 if ptlabel[srs] == dctptlabel[srs][1]: p[1] += 1 if ptlabel[srs] == 1: tp[1] += 1 if dctptlabel[srs][2] != -1: n[2] += 1 if ptlabel[srs] == 1: np[2] += 1 if ptlabel[srs] == dctptlabel[srs][2]: p[2] += 1 if ptlabel[srs] == 1: tp[2] += 1 if dctptlabel[srs][3] != -1: n[3] += 1 if ptlabel[srs] == 1: np[3] += 1 if ptlabel[srs] == dctptlabel[srs][3]: p[3] += 1 if ptlabel[srs] == 1: tp[3] += 1 print(p, n, tp, np, len(ptlabel)) print(p[0]/float(n[0]), tp[0]/float(np[0]), (p[0]-tp[0])/float(n[0]-np[0])) print(p[1]/float(n[1]), tp[1]/float(np[1]), (p[1]-tp[1])/float(n[1]-np[1])) print(p[2]/float(n[2]), tp[2]/float(np[2]), (p[2]-tp[2])/float(n[2]-np[2])) print(p[3]/float(n[3]), tp[3]/float(np[3]), (p[3]-tp[3])/float(n[3]-np[3])) import numpy as np np.save('ptlabel'+str(fold)+'.npy', ptlabel) np.save('dctptlabel'+str(fold)+'.npy', dctptlabel)