function sPROFILER_main(fileName, newChipFileName)
%SPROFILER_MAIN Build per-position statistics from a chip set and re-call a new chip.
%
%   sPROFILER_main(fileName, newChipFileName)
%
%   Inputs
%     fileName        - text file with one chip name per line. For every chip
%                       name C the files C\C_calls.txt (two whitespace-separated
%                       single-character columns: reference base, existing call)
%                       and C\C_int.txt (eight numeric columns per position)
%                       are read.
%     newChipFileName - text file whose first line is the chip name handed to
%                       sPROFILER_call together with the statistics.
%
%   For every position the function fills one row of varChipRatio (numBP-by-3):
%     column 1 - sum over chips whose call equals the reference of
%                (highest / second-highest) intensity in columns 1-4,
%                divided by the number of chips;
%     column 2 - the same for intensity columns 5-8;
%     column 3 - (#chips whose call equals the reference) / (#chips whose call is not 'n').
%   Rows for which no chip matches the reference are set to [1 1 1].
%
%   A tab-separated header line is printed to standard output first.

fprintf('ChipName\tCallRate_GSEQ\tCallRate_sPROFILER\tNo-Calls_GSEQ\tNo-Calls_sPROFILER\tVariants_GSEQ\tVariants_sPROFILER\n');

% The chip list is read twice: once in bulk to learn how many chips there are,
% and once line by line through fidChips to obtain the names in order.
fidChips = fopen(fileName, 'r');
chips = textread(fileName, '%s', 'delimiter', '\n', 'whitespace', '');
firstChip = fgetl(fidChips);
numChips = length(chips);

fidNewChip = fopen(newChipFileName, 'r');

% ---- First chip: provides the reference sequence and the first data columns.
% strcat drops trailing whitespace of each char argument, so a stray trailing
% blank or carriage return on the chip name does not end up inside the path.
callsFile = strcat(firstChip, '\', firstChip, '_calls.txt');
intFile   = strcat(firstChip, '\', firstChip, '_int.txt');
fidCall = fopen(callsFile, 'r');
fidInt  = fopen(intFile, 'r');

refSeq = fscanf(fidCall, '%s %s', [2 inf]);
refSeq = refSeq';
% Row count, not length(): length() of an N-by-2 matrix returns 2 when N is 1.
numBP = size(refSeq, 1);
allCallsArray = refSeq(:, 2);
refSeq = refSeq(:, 1);

intFirst = fscanf(fidInt, '%g %g %g %g %g %g %g %g', [8 inf]);
allIntArray = intFirst';
fclose(fidInt);
fclose(fidCall);

% ---- Remaining chips: append one call column and eight intensity columns each.
% fidChips is deliberately NOT closed and reopened here; doing so would rewind
% the list, load the first chip a second time and never reach the last chip.
for index = 2:numChips
    chipNum = fgetl(fidChips);
    callsFile = strcat(chipNum, '\', chipNum, '_calls.txt');
    intFile   = strcat(chipNum, '\', chipNum, '_int.txt');
    fidCall = fopen(callsFile, 'r');
    fidInt  = fopen(intFile, 'r');

    tempSeq = fscanf(fidCall, '%s %s', [2 inf]);
    tempSeq = tempSeq';
    tempSeq = tempSeq(:, 2);
    allCallsArray = [allCallsArray, tempSeq];

    intArray = fscanf(fidInt, '%g %g %g %g %g %g %g %g', [8 inf]);
    allIntArray = [allIntArray, intArray'];

    fclose(fidCall);
    fclose(fidInt);
end
fclose(fidChips);

% ---- Per-position statistics.
varChipRatio = zeros(numBP, 3);
for numRows = 1:numBP
    numWt = 0;
    numCalls = 0;
    for numCols = 1:numChips
        if (allCallsArray(numRows, numCols) == refSeq(numRows))
            numWt = numWt + 1;
            % Chip k occupies columns 8*(k-1)+1 .. 8*k of allIntArray.
            firstCol = (numCols - 1) * 8 + 1;
            % sort returns a sorted copy; the result has to be assigned.
            intMatFwd = sort(allIntArray(numRows, firstCol:firstCol + 3), 2);
            intMatRev = sort(allIntArray(numRows, firstCol + 4:firstCol + 7), 2);
            varChipRatio(numRows, 1) = varChipRatio(numRows, 1) + intMatFwd(4) / (numChips * intMatFwd(3));
            varChipRatio(numRows, 2) = varChipRatio(numRows, 2) + intMatRev(4) / (numChips * intMatRev(3));
        end
        if (allCallsArray(numRows, numCols) ~= 'n')
            numCalls = numCalls + 1;
        end
    end
    if (numWt > 0)
        varChipRatio(numRows, 3) = numWt / numCalls;
    else
        varChipRatio(numRows, 1) = 1;
        varChipRatio(numRows, 2) = 1;
        varChipRatio(numRows, 3) = 1;
    end
end
%========================================================================================================%
% Hand the statistics to the caller for the chip named on the first line of
% newChipFileName. sPROFILER_call is not defined in this part of the file.
chipNum = fgetl(fidNewChip);
sPROFILER_call(chipNum, varChipRatio);
fclose(fidNewChip);

%XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
function BaseCallHetVal=BaseCallHetVal(chipNum, arrayPosRatio)
%BASECALLHETVAL Re-evaluate the calls of one chip and write a per-position report.
%
%   Inputs
%     chipNum       - chip name; the files chipNum\chipNum_calls.txt,
%                     chipNum\chipNum_int.txt and chipNum\chipNum_qualScores.txt
%                     are read and chipNum\chipNum_OutCalls.txt is written.
%                     'Cincinnati_tiling.txt' is read from the current folder.
%     arrayPosRatio - numBP-by-3 matrix; column 3 scales the peak-ratio
%                     threshold used when rescuing a no-call.
%
%   Prints the chip name followed by old/new call rate, old/new no-call count
%   and old/new variant count, tab separated, to standard output.
%
%   NOTE(review): the output variable is not assigned in the statements visible
%   here, so the function must be invoked without requesting a return value
%   unless code outside this view assigns it - confirm.

fprintf('%s\t', chipNum);
%================================================================================================================================%
%Read input files; create output files.
callsFile = strcat(chipNum, '\', chipNum, '_calls.txt');
intFile   = strcat(chipNum, '\', chipNum, '_int.txt');
outFile   = strcat(chipNum, '\', chipNum, '_OutCalls.txt');

%Read tiling information
tilingIndices = dlmread('Cincinnati_tiling.txt');

%Store intensities into nX8 matrix.
setInt = dlmread(intFile, '\t');
[numBP, colsBP] = size(setInt);

%Store quality scores into nX1 matrix.
qualFile = strcat(chipNum, '\', chipNum, '_qualScores.txt');
qualArray = dlmread(qualFile);

%Read the two call columns: column 1 is the reference, column 2 the existing call.
fid = fopen(callsFile, 'r');
setCalls = fscanf(fid, '%s %s', [2 inf]);
setCalls = setCalls';
fclose(fid);

newCalls = zeros(numBP, 1);
score = zeros(numBP, 1);
model = zeros(numBP, 1);
bases = ['a' 'c' 'g' 't'];
hets = ['x' 'm' 'r' 'w'; 'm' 'x' 's' 'y'; 'r' 's' 'x' 'k'; 'w' 'y' 'k' 'x'];

fid = fopen(outFile, 'wt');
fprintf(fid, 'Index\tRef\tNew\tScore\tGSEQ\tGSEQQual\tModel\tFlag\tA\tC\tG\tT\tA"\tC"\tG"\tT"\n');

% Counters. Several of them are not updated in the statements below; they are
% kept because this function has no closing 'end' and may continue further down.
numCallsOld = 0;
numCalls = 0;
oldHet = 0;
oldSnp = 0;
oldHetWithSeq = 0;
oldSnpWithSeq = 0;
newHet = 0;
newSnp = 0;
newHetWithSeq = 0;
newSnpWithSeq = 0;
varToN = 0;
varToNWithSeq = 0;
%=========================================================================================================================
% Chip-wide mean peak ratios over positions whose existing call is a/c/g/t.
% Each row is sorted ascending so that column 4 is the highest and column 3 the
% second-highest intensity; sort returns a copy, so the result must be assigned.
intMatFwd = sort(setInt(:, 1:4), 2);
intMatRev = sort(setInt(:, 5:8), 2);
peak43F = 0;
peak43R = 0;
peak32F = 0;
peak32R = 0;
meanRatioFwd = 0;
meanRatioRev = 0;
numForThresh = 0;
for temp = 1:numBP
    if (setCalls(temp,2) ~= 'n')
        if (setCalls(temp,2)=='a' | setCalls(temp,2)=='c' | setCalls(temp,2)=='g' | setCalls(temp,2)=='t')
            peak43F = peak43F + intMatFwd(temp,4) / intMatFwd(temp,3);
            peak43R = peak43R + intMatRev(temp,4) / intMatRev(temp,3);
            peak32F = peak32F + intMatFwd(temp,3) / intMatFwd(temp,2);
            peak32R = peak32R + intMatRev(temp,3) / intMatRev(temp,2);
            numForThresh = numForThresh + 1;
        end
    end
end
peak43F = peak43F / numForThresh;
peak43R = peak43R / numForThresh;
peak32F = peak32F / numForThresh;
peak32R = peak32R / numForThresh;
%=========================================================================================================================
NCWt = 0;
NCHet = 0;
NCSnp = 0;
%==========================================%
for temp = 1:numBP
    % Count, among up to 11 neighbours on each side whose tiling index is
    % contiguous with this position, how many existing calls differ from the
    % reference. The walk in each direction stops at the first tiling gap.
    tilingIndex = tilingIndices(temp);
    nCallsInWindow = 0;
    windowLenFwd = 1;
    windowLenRev = 1;
    tilingFlagRev = 1;
    tilingFlagFwd = 1;
    if (temp >= 13 & temp <= numBP - 13)
        while (tilingFlagRev | tilingFlagFwd)
            if ((tilingIndices(temp - windowLenRev) == tilingIndex - windowLenRev) & windowLenRev <= 11 & tilingFlagRev)
                if (setCalls(temp - windowLenRev, 2) ~= setCalls(temp - windowLenRev, 1))
                    nCallsInWindow = nCallsInWindow + 1;
                end
                windowLenRev = windowLenRev + 1;
            else
                tilingFlagRev = 0;
            end
            if ((tilingIndices(temp + windowLenFwd) == tilingIndex + windowLenFwd) & windowLenFwd <= 11 & tilingFlagFwd)
                if (setCalls(temp + windowLenFwd, 2) ~= setCalls(temp + windowLenFwd, 1))
                    nCallsInWindow = nCallsInWindow + 1;
                end
                windowLenFwd = windowLenFwd + 1;
            else
                tilingFlagFwd = 0;
            end
        end
    end

    % Force a no-call when the window fraction reaches 0.60, or when an existing
    % non-reference call has a quality score below 55.
    if (nCallsInWindow / (windowLenFwd + windowLenRev) >= 0.60 | (setCalls(temp,2) ~= 'n' & qualArray(temp) < 55 & (setCalls(temp,1) ~= setCalls(temp,2))))
        newCalls(temp,1) = 'n';
    %Only no-calls are analyzed.
    elseif (setCalls(temp,2) == 'n')
        [fwdVal, fwdIndex] = sort(setInt(temp,1:4), 2);
        [revVal, revIndex] = sort(setInt(temp,5:8), 2);
        fwdTwin = [fwdIndex(3) fwdIndex(4)];
        revTwin = [revIndex(3) revIndex(4)];
        % A stored ratio of exactly 1 is replaced by the chip-wide mean.
        if (arrayPosRatio(temp,1) == 1)
            arrayPosRatio(temp,1) = peak43F;
        end
        if (arrayPosRatio(temp,2) == 1)
            arrayPosRatio(temp,2) = peak43R;
        end
        %If the highest peak on either strand corresponds to the reference base
        %AND exceeds the second-highest peak by the chip-wide mean ratio scaled
        %by arrayPosRatio(:,3), assign the reference base. Model = 3.
        %Columns 5-8 are looked up as bases(idx), columns 1-4 as bases(5-idx).
        revThresh = peak43R / arrayPosRatio(temp,3);
        fwdThresh = peak43F / arrayPosRatio(temp,3);
        if (((bases(revIndex(4)) == setCalls(temp,1)) & (revVal(4) >= revVal(3) * revThresh)) | ((bases(5 - fwdIndex(4)) == setCalls(temp,1)) & (fwdVal(4) >= fwdVal(3) * fwdThresh)))
            newCalls(temp,1) = setCalls(temp,1);
            score(temp,1) = (fwdVal(4) * revVal(4) * fwdVal(4) * revVal(4)) / (fwdVal(3) * revVal(3) * mean(fwdVal(1:3)) * mean(revVal(1:3)));
            model(temp,1) = 3;
        else
            newCalls(temp,1) = 'n';
        end
    else
        % Existing call is kept unchanged.
        newCalls(temp,1) = setCalls(temp,2);
    end
    %========================================================================================================================
    %Compute new call rate
    if (char(newCalls(temp,1)) ~= 'n')
        numCalls = numCalls + 1;
    end
    if (setCalls(temp,2) ~= 'n')
        numCallsOld = numCallsOld + 1;
    end
    %Calculate Var calls converted to N
    if (char(newCalls(temp,1)) == 'n' & setCalls(temp,2) ~= 'n' & setCalls(temp,2) ~= setCalls(temp,1))
        varToN = varToN + 1;
    end

    %Matches with reference seq.
    if (newCalls(temp,1) == setCalls(temp,1) & newCalls(temp,1) == setCalls(temp,2))
        snpType = 0;
    %No-call converted to refseq.
    elseif (newCalls(temp,1) == setCalls(temp,1) & setCalls(temp,2) == 'n')
        snpType = 1;
        NCWt = NCWt + 1;
    %Hets from GDAS (old Hets)
    elseif (setCalls(temp,2)~='a' & setCalls(temp,2)~='c' & setCalls(temp,2)~='g' & setCalls(temp,2)~='t' & setCalls(temp,2)~='n')
        snpType = 2;
        oldHet = oldHet + 1;
    %SNP's from GDAS (old SNPs)
    elseif ((setCalls(temp,2)=='a' | setCalls(temp,2)=='c' | setCalls(temp,2)=='g' | setCalls(temp,2)=='t') & setCalls(temp,2) ~= setCalls(temp,1))
        snpType = 3;
        oldSnp = oldSnp + 1;
    %Still a no-call.
    elseif (newCalls(temp,1) == 'n')
        snpType = 4;
    end

    % One report row per position; the last eight fields are the raw intensities.
    fprintf(fid, '%d\t%c\t%c\t%g\t%c\t%g\t%d\t%d\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n', temp, setCalls(temp,1), newCalls(temp,1), score(temp,1), setCalls(temp,2), qualArray(temp), model(temp,1), snpType, setInt(temp,1), setInt(temp,2), setInt(temp,3), setInt(temp,4), setInt(temp,5), setInt(temp,6), setInt(temp,7), setInt(temp,8));
end
fclose(fid);

%CallRate
fprintf('%g\t%g\t', 100 * numCallsOld / (numBP), 100 * numCalls / (numBP));
%No-calls
fprintf('%g\t%g\t', numBP - numCallsOld, numBP - numCalls);
%Variants
fprintf('%d\t%d\t', oldSnp + oldHet, oldSnp + oldHet - varToN);
%=========================================================================================================================