import java.io.DataOutputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; import java.applet.*; import java.awt.*; import java.io.*; import java.net.*; public class nucleotideReader { //Object Constructor public nucleotideReader(String inputFileName, String outputProteinFile, String outputPercentFile, String outputSequenceFile) { // TODO Auto-generated constructor stub this.inputFileName = inputFileName; this.outputPercentFile = outputPercentFile; this.outputProteinFile = outputProteinFile; this.outputSequenceFile = outputSequenceFile; try{ myLineReader = new LineReader(inputFileName); } catch (FileNotFoundException e) { System.out.println("File " + inputFileName + " does not exist\n"); e.printStackTrace(); } try{ //myOutputFile = new FileWriter("outputOfProgram.html"); proteinFile = new FileWriter(outputProteinFile); percentFile = new FileWriter(outputPercentFile); sequenceFile = new FileWriter(outputSequenceFile); } catch (Exception e) { System.out.println("Couldn't Create Output File"); } myArrayList = new ArrayList(); } /* * Below we will create the methods for this class/object * The first three methods are called "accessors" * They allow someone using this class to access the data * that this class stores. * Each of the following accessors is publicly available * and returns a string. */ public String getInputFileName() { return inputFileName; } public String getOutputProteinFile(){ return outputProteinFile; } public String getOutputPercentFile(){ return outputPercentFile; } /*************************************** * This method, called find protein, * takes a line from the output HTML * which contains the protein name, but * also contains many garbage characters * that contain the URL linking that exists * on the main webpage. * What we do is attempt to find the first point * where the protein name is actually written - * that being ref|Y etc or gb| etc - we are assuming * that first aspect only has 2 or three characters with * a | after it. * So, we search for the end of an HTML tag, being a > * and then look for a | to show that the HTML is done. * What we do is take a substring by eliminating the first * character one at a time and replacing it. * Admittedly this is inefficient but in the interest of getting * it to work we don't really care. * After we find the find that aspect, we copy the information * into the resulting Protein string. * We then look for the next part which is actually the protein name. * That is the information at the end of the result string * that follows the tag of "Gene Info" * So we trim the result string until we find Gene Info - * which we know the location of based on the built in "index of" function. * @param results * @return */ private String findProtein(String results){ String protein = ""; //System.out.println(results); while(!( (results.charAt(0) == '>') && ((results.charAt(3) == '|') || (results.charAt(4) == '|'))) ){ results = results.substring(1); } results = results.substring(1); while(results.charAt(0) != '<'){ protein = protein + results.charAt(0); results = results.substring(1); } int i = 0; int location = results.indexOf("Gene info"); while(i <= location){ results = results.substring(1); i = i + 1; } int j = 0; while(j < 2) { if (results.charAt(0) == '>') j = j + 1; results = results.substring(1); } protein = protein + results; return protein; } /*********************************************** * This works very similarly to the find Protein function * However, its quite simple, because there isn't all the * HTML syntax to go through. All we need to * Is eliminate the Identity = at the beginning, * and stop copying after the , from the first results. * * @param results * @return */ private String findPercent(String results){ String percent = ""; while(results.charAt(0) != '=') { results = results.substring(1); } results = results.substring(2); while(results.charAt(0) != ',') { percent = percent + results.charAt(0); results = results.substring(1); } return percent; } /* * The Run method below here will be doing the actual work * of reading the files, talking to the website * the reading from the website, and finally, finishing by * writing to the two output files. * Doesn't return anything, or take any inputs. */ private BufferedReader sendRequestForData(String RID, HttpURLConnection connection, HttpURLConnection getConnection) { //SET UP THE RESULTS REQUEST String getOutputData = ""; try{ getOutputData = getOutputData + "CMD=" + URLEncoder.encode("Get", "UTF-8"); getOutputData = getOutputData + "&RID=" + URLEncoder.encode(RID, "UTF-8"); getOutputData = getOutputData + "&ALIGNMENTS=" + URLEncoder.encode("500", "UTF-8"); getOutputData = getOutputData + "&ALIGNMENT_VIEW=" + URLEncoder.encode("Pairwise", "UTF-8"); getOutputData = getOutputData + "&DESCRIPTIONS=" + URLEncoder.encode("500", "UTF-8"); //getOutputData = getOutputData + "&ENTREZ_LINKS_NEW_WINDOW=" + URLEncoder.encode("empty", "UTF-8"); //getOutputData = getOutputData + "&FORMAT_TYPE=" + URLEncoder.encode("Text", "UTF-8"); //COULD MAKE THAT HTML //OUTPUT THE REQUEST FOR RESULTS DataOutputStream getOutputToPage = new DataOutputStream(getConnection.getOutputStream()); System.out.println(getOutputData); getOutputToPage.writeBytes(getOutputData); getOutputToPage.flush(); getOutputToPage.close(); //CREATE A READER TO GET RESULTS FROM THE WEBPAGE BufferedReader inputFromPage = new BufferedReader(new InputStreamReader(getConnection.getInputStream())); return inputFromPage; }catch (Exception e){ System.out.println("exception"); return null; } } public void run(){ int count = 1; myArrayList = myLineReader.readLine(); if(myArrayList == null){ return; } do{ //Below is the Code for Accessing the Website with the given Nucleotide Line. try { URL url = new URL("http://www.ncbi.nlm.nih.gov/BLAST/Blast.cgi"); StringBuffer sequence = new StringBuffer(); //String sequence = myArrayList.toString(); for (int j = 0; j < myArrayList.size(); j++) { sequence.append(myArrayList.get(j).toString()); } System.out.println(sequence.toString()); HttpURLConnection connection = (HttpURLConnection)url.openConnection(); connection.setDoOutput(true); connection.setDoInput(true); connection.setRequestMethod("POST"); connection.setUseCaches(false); connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); DataOutputStream outputToPage = new DataOutputStream(connection.getOutputStream()); String outputData = ""; //ENCODING DATA FOR SUBMISSION outputData = outputData + "CMD="/*URLEncoder.encode("CMD=", "UTF-8")*/ + URLEncoder.encode("Put", "UTF-8"); outputData = outputData + "&QUERY="/*URLEncoder.encode("&QUERY=", "UTF-8")*/ + URLEncoder.encode(sequence.toString(), "UTF-8"); //This line may not be necessary - line below that is. //outputData = outputData + URLEncoder.encode("&db=", "UTF-8")*/ + URLEncoder.encode("nucleotide", "UTF-8"); outputData = outputData + "&QUERY_FROM="/*URLEncoder.encode("&QUERY_FROM=", "UTF-8")*/ + URLEncoder.encode("0", "UTF-8"); outputData = outputData + "&QUERY_TO="/*URLEncoder.encode("&QUERY_TO=", "UTF-8")*/ + URLEncoder.encode("0", "UTF-8"); //May need a better way to show "empty" outputData = outputData + "&QUERY_FILE="/*URLEncoder.encode("&QUERY_FILE=", "UTF-8")*/ + URLEncoder.encode("", "UTF-8"); outputData = outputData + "&GENETIC_CODE="/*URLEncoder.encode("&GENETIC_CODE=", "UTF-8")*/ + URLEncoder.encode("1", "UTF-8"); //outputData = outputData + URLEncoder.encode("&JOB_TITLE=", "UTF-8") + URLEncoder.encode("", "UTF-8"); //May not be necessary //outputData = outputData + URLEncoder.encode("&DBTYPE=", "UTF-8") + URLEncoder.encode("hc", "UTF-8"); //Default is nr... dunno what else is reasonable here - may need new ones each time? outputData = outputData + "&DATABASE="/*URLEncoder.encode("&DATABASE=", "UTF-8")*/ + URLEncoder.encode("nr"/*"Test/gpipe/9606/allcontig_and_rna"*/, "UTF-8"); //outputData = outputData + URLEncoder.encode("&EQ_TEXT=", "UTF-8") + URLEncoder.encode("empty", "UTF-8"); outputData = outputData + "&AUTO_FORMAT="/*URLEncoder.encode("&AUTO_FORMAT=", "UTF-8")*/ + URLEncoder.encode("off", "UTF-8"); outputData = outputData + "&COMPOSITION_BASED_STATISTICS="/*URLEncoder.encode("&COMPOSITION_BASED_STATISTICS=", "UTF-8")*/ + URLEncoder.encode("no", "UTF-8"); //May Not Need outputData = outputData + "&ENDPOINTS="/*URLEncoder.encode("&ENDPOINTS=", "UTF-8")*/ + URLEncoder.encode("no", "UTF-8"); //outputData = outputData + "&ENTREZ_QUERY="/*URLEncoder.encode("&ENTREZ_QUERY=", "UTF-8")*/ + URLEncoder.encode("empty", "UTF-8"); outputData = outputData + "&EXPECT="/*URLEncoder.encode("&EXPECT=", "UTF-8")*/ + URLEncoder.encode("10", "UTF-8"); outputData = outputData + "&GAPCOSTS="/*URLEncoder.encode("&GAPCOSTS=", "UTF-8")*/ + URLEncoder.encode("11 1", "UTF-8"); outputData = outputData + "&HITLIST_SIZE="/*URLEncoder.encode("&HITLIST_SIZE=", "UTF-8")*/ + URLEncoder.encode("500", "UTF-8"); outputData = outputData + "&LAYOUT="/*URLEncoder.encode("&LAYOUT=", "UTF-8")*/ + URLEncoder.encode("OneWindow", "UTF-8"); outputData = outputData + "&LCASE_MASK="/*URLEncoder.encode("&LCASE_MASK=", "UTF-8")*/ + URLEncoder.encode("no", "UTF-8"); //could change to BLOSUM80 outputData = outputData + "&MATRIX_NAME="/*URLEncoder.encode("&MATRIX_NAME=", "UTF-8")*/ + URLEncoder.encode("BLOSUM62", "UTF-8"); outputData = outputData + "&FILTER="/*URLEncoder.encode("&FILTER=", "UTF-8")*/ + URLEncoder.encode("L", "UTF-8"); outputData = outputData + "&WORDSIZE="/*URLEncoder.encode("&WORDSIZE=", "UTF-8")*/ + URLEncoder.encode("3", "UTF-8"); outputData = outputData + "&PROGRAM="/*URLEncoder.encode("&PROGRAM=", "UTF-8")*/ + URLEncoder.encode("blastx", "UTF-8"); outputData = outputData + "&QUERY_BELIEVE_DEFLINE="/*URLEncoder.encode("&QUERY_BELIEVE_DEFLINE=", "UTF-8")*/ + URLEncoder.encode("no", "UTF-8"); //ignore? outputData = outputData + "&SEARCHSP_EFF"/*URLEncoder.encode("&SEARCHSP_EFF=", "UTF-8")*/ + URLEncoder.encode("0", "UTF-8"); outputData = outputData + "&SERVICE="/*URLEncoder.encode("&SERVICE=", "UTF-8")*/ + URLEncoder.encode("plain", "UTF-8"); //outputData = outputData + URLEncoder.encode("&THRESHOLD=", "UTF-8") + URLEncoder.encode("???", "UTF-8"); outputData = outputData + "&UNGAPPED_ALIGNMENT="/*URLEncoder.encode("&UNGAPPED_ALIGNMENT=", "UTF-8")*/ + URLEncoder.encode("no", "UTF-8"); System.out.println(outputData); //WRITE THE INFORMATION TO THE WEBPAGE outputToPage.writeBytes(outputData); outputToPage.flush(); outputToPage.close(); //CREATE A READER TO GET RESULTS FROM THE WEBPAGE BufferedReader inputFromPage = new BufferedReader(new InputStreamReader(connection.getInputStream())); String line; String RID = ""; // STRING THAT WILL CONTAIN ID String RTOE = ""; //STRING THAT WILL CONTAIN RETURN TIME OF ENTRY //Reading Output From Database while((line = inputFromPage.readLine()) != null) { if(line.contains("RID = ")){ RID = line; RID = RID.substring(10); System.out.println(RID); } else if (line.contains("RTOE = ")) { RTOE = line; RTOE = RTOE.substring(11); System.out.println(RTOE); } }//End of this inner while inputFromPage.close(); URL getUrl = new URL("http://www.ncbi.nlm.nih.gov/BLAST/Blast.cgi"); HttpURLConnection getConnection = (HttpURLConnection)getUrl.openConnection(); getConnection.setDoOutput(true); getConnection.setDoInput(true); getConnection.setRequestMethod("GET"); getConnection.setUseCaches(false); getConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); //END OF PUT - NOW MUST USE GET TO FIND RESULTS //FIRST WE WILL WAIT TWICE AS LONG TO MAKE SURE RESULTS ARE READY long theCurrentTime = System.currentTimeMillis(); Long returnTimeAsLong = new Long(RTOE); //Better than the Thread.sleep idea while(System.currentTimeMillis() <= theCurrentTime + (returnTimeAsLong.longValue()*1000)) { //Do Nothing } //SNED REQUEST HERE //READ THE RESULTS BufferedReader getInputFromPage; getInputFromPage = sendRequestForData(RID, connection, getConnection); int i = 0; String getLine; String finalOutput = ""; String bufferedProtein = ""; boolean atSection = false; boolean ignore = false; boolean beganProtein = false; boolean beganPercent = false; boolean shouldIBreak = false; while((getLine = getInputFromPage.readLine()) != null) { if (getLine.contains("Status=READY")) { shouldIBreak = true; } else if(getLine.contains("Status=WAITING")) { System.out.println("Got Waiting"); theCurrentTime = System.currentTimeMillis(); while(System.currentTimeMillis() <= theCurrentTime + (returnTimeAsLong.longValue()*500)) { //Do Nothing } getInputFromPage.close(); getConnection = (HttpURLConnection)getUrl.openConnection(); getConnection.setDoOutput(true); getConnection.setDoInput(true); getConnection.setRequestMethod("GET"); getConnection.setUseCaches(false); getConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); getInputFromPage = sendRequestForData(RID, connection, getConnection); } if(shouldIBreak) break; } System.out.println("The Break Worked"); //Reading Output From Database while((getLine = getInputFromPage.readLine()) != null) { //System.out.println(getLine); if(getLine.contains("")){ atSection = true; } if(atSection && getLine.contains("[") && (!beganProtein)) { beganProtein = true; bufferedProtein = findProtein(getLine); if(getLine.endsWith("]")){ proteinFile.write(/*count + ") " + */bufferedProtein + "\r\n"); proteinFile.flush(); ignore = true; } } if(atSection && (i == 1) && (!ignore) ) { bufferedProtein = bufferedProtein + getLine; proteinFile.write(/*count + ") " + */bufferedProtein + "\r\n"); proteinFile.flush(); ignore = true; } //WRITE THE PROTEIN NAME TO THE FILE ABOVE HERE //BELOW - WE WRITE THE IDENTITY RESULTS if(atSection && getLine.contains("Identities") && (!beganPercent)) { beganPercent = true; percentFile.write(/*count + ") " + */findPercent(getLine) + "\r\n"); percentFile.flush(); } if(beganProtein) { i = 1; } //THIS WAS FOR TESTING PURPOSES< WOULD SAVE THE ENTIRE HTML > //finalOutput = getLine + "\r\n"; //myOutputFile.write(finalOutput); //myOutputFile.flush(); }//End of this inner while //END OF RESULTS getInputFromPage.close(); //myOutputFile.write(finalOutput); //myOutputFile.flush(); //myOutputFile.close(); sequenceFile.write("Sequence number " + count + " successfully written\r\n"); sequenceFile.flush(); } catch(IOException e) { e.printStackTrace(); } System.out.println("Finished sequence number " + count); count = count + 1; myArrayList = myLineReader.readLine(); //END of a Sequence } while(myArrayList != null); //IF DONE WITH ALL SEQUENCES try { myLineReader.close(); proteinFile.close(); percentFile.close(); sequenceFile.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //Object Data private String inputFileName; private String outputProteinFile; private String outputPercentFile; private String outputSequenceFile; private LineReader myLineReader; private ArrayList myArrayList; private FileWriter myOutputFile; private FileWriter proteinFile; private FileWriter percentFile; private FileWriter sequenceFile; }