BioPHP: PHP for Biocomputing


[ Home Page ] [ I/O Scripts Page ]

Source Code Listing of seqdb.inc.php

Note: This is part of the BioPHP 1.1 alpha code set. The code, which is more than 2,200 lines long, is still rough. Improvements to the
code are welcome!
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");

// NOTE(review): this file targets the PHP 4/5 runtime it was written for. It relies on
// each() (removed in PHP 8.0) and on the mysql_* extension (removed in PHP 7.0).

// ================== FUNCTIONS ========================

// get_seq_swp() parses a Swissprot record (an array of lines) and returns only its sequence.
// $dbformat is accepted for signature symmetry with the other get_* helpers but is not used.
// NOTE(review): parse_swissprot() is invoked statically here; confirm it never relies on $this.
function get_seq_swp($record, $dbformat)
{
    $oSeq = seqdb::parse_swissprot($record);
    return $oSeq->sequence;
}

// get_swp_entname() returns the part of the entry name that precedes the first underscore.
// It looks at the 10 characters starting at offset 5 of the first line of $record.
// $dbformat is not used.
function get_swp_entname($record, $dbformat)
{
    $first_line = $record[0];
    $xy = substr($first_line, 5, 10);
    $xyr = preg_split("/_/", $xy);
    return ($xyr[0]);
}

// get_gb_id() returns the entry id found in columns 12-27 of the first line of $record.
// Returns NULL (implicitly) when $dbformat is anything other than "GENBANK".
function get_gb_id($record, $dbformat)
{
    $first_line = $record[0];
    if ($dbformat == "GENBANK") return trim(substr($first_line, 12, 16));
}

// format_mysqlcol() checks whether a value is NULL or a string, and returns a version formatted
// esp. for mysql: 'NULL' for NULL/blank values, a quoted and addslashes()-escaped literal for
// non-blank strings, and the value itself for anything else.
function format_mysqlcol($value)
{
    // Real numbers are handled first: the loose "== NULL" test below is TRUE for 0 and 0.0,
    // which used to turn a legitimate zero into an SQL NULL.
    if (is_int($value) || is_float($value)) return $value;

    if ($value == NULL) return 'NULL';
    elseif (gettype($value) == "string")
    {
        if (strlen(trim($value)) > 0) return ("'" . addslashes($value) . "'");
        else return 'NULL';
    }
    else return $value;
    // later, add more ifs (for boolean values, etc.)
}

// topo_code() converts a topology word to its one-letter code: "LINEAR" => "L", "CIRCULAR" => "C".
// Matching ignores case and surrounding whitespace. Returns NULL for any other word.
function topo_code($topo_word)
{
    $topo_word = trim(strtoupper($topo_word));
    if ($topo_word == "LINEAR") return "L";
    elseif ($topo_word == "CIRCULAR") return "C";
    return NULL;
}

// topo_word() is the inverse of topo_code(): "L" => "LINEAR", "C" => "CIRCULAR".
// Matching ignores case and surrounding whitespace. Returns NULL for any other code.
function topo_word($topo_code)
{
    // Was strtotupper(), which does not exist and made every call a fatal error.
    $topo_code = trim(strtoupper($topo_code));
    if ($topo_code == "L") return "LINEAR";
    elseif ($topo_code == "C") return "CIRCULAR";
    return NULL;
}

// chg_firstkey() is an array_walk() callback. It renames the first key of a feature's
// qualifier array to "LOCATION", keeping its value and the order of the remaining entries.
// $key is supplied by array_walk() and is not used.
function chg_firstkey(&$value, $key)
{
    // $value is array("CDS" => "1..901", "/gene" => "etc", ... )
    $first = array_slice($value, 0, 1);
    // $first is array("CDS" => "1..901")
    $firstvalue = array_values($first);
    // $firstvalue = array("1..901");
    $location = $firstvalue[0];
    array_shift($value);
    // $value is array("/gene" => "etc", ... )
    $value = array_merge(array("LOCATION" => $location), $value);
}

// monthno() converts a three-letter month abbreviation (any case) to its two-digit number,
// e.g. "Dec" => "12". Returns NULL when the abbreviation is not recognized.
function monthno($monstr)
{
    $months = array(
        "JAN" => "01", "FEB" => "02", "MAR" => "03", "APR" => "04",
        "MAY" => "05", "JUN" => "06", "JUL" => "07", "AUG" => "08",
        "SEP" => "09", "OCT" => "10", "NOV" => "11", "DEC" => "12"
    );
    $monstr = strtoupper($monstr);
    if (isset($months[$monstr])) return $months[$monstr];
    return NULL;
}

// parse_genbank() parses a GenBank data file and returns a Seq object containing parsed data.
//
//   $flines : array of text lines. Parsing stops at the first "//" line, so only one
//             record is consumed per call.
//   $sql_db : "NONE" (default) just returns the Seq object. Any other value is used as the
//             name of a MySQL database on localhost into which the record is also inserted;
//             the function die()s on any database failure.
//
// Most fields are tracked twice: on $seqobj (the object version) and in $seqobj_* locals
// (the SQL version).
function parse_genbank($flines, $sql_db = "NONE")
{
    $seqarr = array();
    $inseq_flag = false;
    $seqdata_flag = false;
    $accession_flag = false;
    $ref_array = array();
    $entry_ctr = 0;
    $ref_ctr = 0;
    $maxlength = 0;
    $minlength = 999999;
    $tot_seqlength = 0;

    // A continuation line has a completely blank 12-column label. (The comparisons further
    // down used to test against a one-character string, which a 12-character slice can
    // never equal.)
    $blank_label = str_repeat(" ", 12);

    // Defaults for sections that may be missing from a record, so the code below never
    // reads an undefined variable.
    $seqdata = "";
    $feat_r = array();
    $seqobj_accession = NULL;
    $seqobj_id = NULL;
    $seqobj_sec_accession = array();
    $seqobj_source = NULL;
    $seqobj_description = NULL;
    $seqobj_organism = NULL;
    $seqobj_segment_no = NULL;
    $seqobj_segment_count = NULL;
    $seqobj_version = NULL;
    $seqobj_ncbi_gi_id = NULL;
    $seqobj_keywords = NULL;

    // May 22, 2003: Added the lines below to handle multi-line SOURCE entries.
    $in_source_flag = FALSE;
    $source_string = "";

    // May 22, 2003: Added the lines below to handle multi-line ORGANISM entries (w/c is a
    // subrecord under SOURCE, 1st line is species, the next lines are kingdom; phylum; etc.)
    $aTaxonomy = array();
    $tax_string = "";
    $in_organism_flag = FALSE;

    // May 22, 2003: Added the lines below to handle multi-line KEYWORD entries.
    $wordarray = array();
    $keywords_string = "";
    $in_keywords_flag = FALSE;

    while( list($lineno, $linestr) = each($flines) )
    { // OPENS outermost while( list($lineno, $linestr) = each($flines) )
        if (substr($linestr,0,5) == "LOCUS")
        {
            $entry_ctr++;
            $ref_ctr = 0;
            $ref_array = array();
            // This is the beginning of a SEQUENCE ENTRY.
            $seqdata = "";

            // SEQ OBJ VERSION: DELETE THIS ENTIRE SECTION LATER.
            $seqobj = new seq();
            $seqobj->id = trim(substr($linestr, 12, 16));
            $tot_seqlength += $seqobj->seqlength;
            if ($seqobj->seqlength > $maxlength) $maxlength = $seqobj->seqlength;
            if ($seqobj->seqlength < $minlength) $minlength = $seqobj->seqlength;
            if (substr($linestr, 44, 3) == "ss-") $seqobj->strands = "SINGLE";
            elseif (substr($linestr, 44, 3) == "ds-") $seqobj->strands = "DOUBLE";
            elseif (substr($linestr, 44, 3) == "ms-") $seqobj->strands = "MIXED";
            $seqobj->topology = strtoupper(substr($linestr, 55, 8));
            $seqobj->division = strtoupper(substr($linestr, 64, 3));

            // SQL VERSION
            // PRIME_ACC, ENTRY_NAME, and SEQ_LENGTH
            $seqobj_id = trim(substr($linestr, 12, 16));
            // An explicit cast copes with a blank length column; "* 1" on a non-numeric
            // string is an error on newer PHP versions.
            $seqobj_seqlength = (int) trim(substr($linestr, 29, 11));

            // MOL_TYPE:
            // June 10, 2003: TRIMmed the value of moltype to remove leading/trailing whitespaces.
            $seqobj_moltype = trim(substr($linestr, 47, 6));

            // DATE: Converts from 25-DEC-2002 => 2002-12-25
            $seqobj_date = strtoupper(substr($linestr, 68, 11));
            if (strlen(trim($seqobj_date)) == 0) $seqobj_date = NULL;
            else
            {
                $day = left($seqobj_date, 2);
                $mon = monthno(substr($seqobj_date, 3, 3));
                $year = right($seqobj_date, 4);
                $seqobj_date = "$year-$mon-$day";
            }

            // GBSEQUENCE: PRIME_ACC STRANDS TOPOLOGY DIVISION
            // STRANDS, TOPOLOGY, DIVISION
            $seqobj_strands = trim(strtoupper(substr($linestr, 44, 2)));
            $seqobj_topology = topo_code(substr($linestr, 55, 8));
            $seqobj_division = strtoupper(substr($linestr, 64, 3));

            $inseq_flag = true;
        }

        if (substr($linestr,0,10) == "DEFINITION")
        {
            $wordarray = explode(" ", $linestr);
            array_shift($wordarray);
            // May 21, 2003: Remove trailing/leading blanks from DEFINITION string
            // by enclosing the expression implode(...) inside a trim() function.
            $seqobj->definition = trim(implode(" ", $wordarray));
            $seqobj_description = implode(" ", $wordarray);
        }

        if ($inseq_flag == TRUE)
        { // OPENS if ($inseq_flag == TRUE)
            if (trim(substr($linestr, 0, 12)) == "REFERENCE")
            {
                // at this point, we are at the line with REFERENCE x (base y of z) in it.
                $wordarray = preg_split("/\s+/", trim(substr($linestr,12)));
                $ref_rec = array();
                $ref_rec["REFNO"] = $wordarray[0];
                array_shift($wordarray);
                $ref_rec["BASERANGE"] = implode(" ", $wordarray);
                $ref_baserange = implode(" ", $wordarray);
                $lastsubkey = "";
                $subkey_lnctr = 0;

                // NOTE: this inner loop leaves $linestr holding the FEATURES/REFERENCE line
                // that ended it and rewinds the array pointer by one. The rest of the current
                // outer iteration (including the FEATURES block at the bottom) therefore sees
                // that line. Do not reorder.
                while( list($lineno, $linestr) = each($flines) )
                {
                    $subkey = trim(substr($linestr,0,12));
                    // If current subkey is blank string, then this is a continuation of the last subsection.
                    if (strlen($subkey) == 0) $subkey = $lastsubkey;
                    // If we are at the next subkey section (e.g. lastsubkey was AUTHORS, and current is TITLE).
                    if ($subkey != $lastsubkey) $subkey_lnctr = 0;

                    switch ($subkey)
                    {
                        case "AUTHORS":
                            $subkey_lnctr++;
                            $wordarray = preg_split("/\s+/", trim(substr($linestr,12)));
                            // we remove comma at the end of a name, and the element "and".
                            $newarray = array();
                            foreach($wordarray as $authname)
                            {
                                if (strtoupper($authname) != "AND")
                                {
                                    if (substr($authname, strlen($authname)-1, 1) == ",")
                                        $authname = substr($authname, 0, strlen($authname)-1);
                                    $newarray[] = $authname;
                                }
                            }
                            if ($subkey_lnctr == 1) $ref_rec["AUTHORS"] = $newarray;
                            else $ref_rec["AUTHORS"] = array_merge($ref_rec["AUTHORS"], $newarray);
                            break;
                        case "TITLE":
                            $subkey_lnctr++;
                            if ($subkey_lnctr == 1) $ref_rec["TITLE"] = trim(substr($linestr,12));
                            else $ref_rec["TITLE"] .= " " . trim(substr($linestr,12));
                            break;
                        case "JOURNAL":
                            $subkey_lnctr++;
                            if ($subkey_lnctr == 1) $ref_rec["JOURNAL"] = trim(substr($linestr,12));
                            else $ref_rec["JOURNAL"] .= " " . trim(substr($linestr,12));
                            break;
                        case "MEDLINE":
                            $ref_rec["MEDLINE"] = substr($linestr, 12, 8);
                            break;
                        case "PUBMED":
                            $ref_rec["PUBMED"] = trim(substr($linestr, 12));
                            break;
                        case "REMARK":
                            $subkey_lnctr++;
                            if ($subkey_lnctr == 1) $ref_rec["REMARK"] = trim(substr($linestr,12));
                            else $ref_rec["REMARK"] .= " " . trim(substr($linestr,12));
                            break;
                        case "COMMENT":
                            $subkey_lnctr++;
                            if ($subkey_lnctr == 1) $ref_rec["COMMENT"] = trim(substr($linestr,12));
                            else $ref_rec["COMMENT"] .= " " . trim(substr($linestr,12));
                            break;
                    }

                    if ($subkey == "FEATURES")
                    {
                        prev($flines);
                        break;
                    }
                    if ($subkey == "REFERENCE")
                    {
                        $ref_ctr++;
                        prev($flines);
                        break;
                    }
                    $lastsubkey = $subkey;
                }
                array_push($ref_array, $ref_rec);
            }

            if (trim(substr($linestr, 0, 12)) == "SEGMENT")
            {
                $seqobj->segment = substr($linestr, 12);
                $wordarray = preg_split("/\s+/", trim(substr($linestr,12)));
                $seqobj->segment_no = $wordarray[0];
                $seqobj->segment_count = $wordarray[2];
                // SQL VERSION
                $seqobj_segment_no = $wordarray[0];
                $seqobj_segment_count = $wordarray[2];
            }

            // SOURCE SOURCE SOURCE SOURCE section
            // May 22, 2003: I will now make the code capable of handling multiple lines.
            if (trim(substr($linestr, 0, 12)) == "SOURCE")
            {
                $source_string .= rtrim(substr($linestr, 12)) . " ";
                $in_source_flag = TRUE;
            }
            elseif ( (substr($linestr,0,12) == $blank_label) and ($in_source_flag) )
            {
                $source_string .= rtrim(substr($linestr,12)) . " ";
            }
            elseif ( (substr($linestr,0,12) != $blank_label) and ($in_source_flag) )
            {
                $seqobj->source = trim($source_string);
                $seqobj_source = trim($source_string);
                $source_string = "";
                $in_source_flag = FALSE;
            }

            // KEYWORDS section (may span several lines).
            if (trim(substr($linestr, 0, 12)) == "KEYWORDS")
            {
                $keywords_string .= trim(substr($linestr,12)) . " ";
                // if the keyword in the first line is a exactly a period (.) then
                // don't bother processing the succeeding (2nd, 3rd) lines.
                // KEYWORDS property remains NULL (unset) if only a period is found after KEYWORDS.
                if ($keywords_string == ". ") $in_keywords_flag = FALSE;
                else $in_keywords_flag = TRUE;
            }
            elseif ( (substr($linestr,0,12) == $blank_label) and ($in_keywords_flag) )
            {
                $keywords_string .= trim(substr($linestr,12)) . " ";
            }
            elseif ( (substr($linestr,0,12) != $blank_label) and ($in_keywords_flag) )
            {
                // remove leading/trailing whitespaces from $keyword_string
                $keywords_string = trim($keywords_string);
                // remove the last character (which is always a period) from $keyword_string.
                $keywords_string = substr($keywords_string, 0, (strlen($keywords_string)-1));
                $wordarray = preg_split("/;/", trim($keywords_string), -1, PREG_SPLIT_NO_EMPTY);
                // Store result in the KEYWORDS property/attribute of our container SEQ object.
                // June 9, 2003: Added the line below to trim lead/trailg spaces from keywords.
                array_walk($wordarray, "trim_element");
                $seqobj->keywords = $wordarray;
                $seqobj_keywords = $wordarray;
                $keywords_string = "";
                $in_keywords_flag = FALSE;
            }

            if (substr($linestr, 0, 7) == "VERSION")
            {
                // Assume that VERSION line is made up of exactly 2 or 3 tokens.
                // May 21, 2003: Revised the code here a bit. I instead worked on
                // the substring of the VERSION starting with the data (not including
                // the word VERSION itself).
                $wordarray = preg_split("/\s+/", trim(substr($linestr,12)));
                $seqobj->version = $wordarray[0];
                $seqobj_version = $wordarray[0];
                if (count($wordarray) == 2)
                {
                    $seqobj->ncbi_gi_id = $wordarray[1];
                    $seqobj_ncbi_gi_id = $wordarray[1];
                }
                $accession_flag = false;
            }

            if ($accession_flag == TRUE)
            {
                // 2nd, 3rd, etc. line of ACCESSION field.
                $wordarray = preg_split("/\s+/", trim($linestr));
                $seqobj_sec_accession = array_merge($seqobj_sec_accession, $wordarray);
                // Keep the object version in step with the SQL version.
                $seqobj->sec_accession = $seqobj_sec_accession;
            }

            if (substr($linestr,0,9) == "ACCESSION")
            {
                $wordarray = preg_split("/\s+/", trim($linestr));
                $seqobj->accession = $wordarray[1];
                // SQL VERSION
                $seqobj_accession = $wordarray[1];
                array_shift($wordarray);
                array_shift($wordarray);
                $seqobj->sec_accession = $wordarray;
                $seqobj_sec_accession = $wordarray;
                $accession_flag = true;
            }

            // The ORGANISM label is indented by two blanks, giving a 10-character prefix.
            if (substr($linestr,0,10) == "  ORGANISM")
            {
                $seqobj->organism = substr($linestr,12);
                $seqobj_organism = substr($linestr,12);
                $in_organism_flag = TRUE;
            }
            elseif ( (substr($linestr,0,12) == $blank_label) and ($in_organism_flag) )
            {
                $tax_string .= substr($linestr,12);
            }
            elseif ( (substr($linestr,0,12) != $blank_label) and ($in_organism_flag) )
            {
                // NOTE: For now, we just dump everything into one simple, numbered array.
                // Later, we can convert this into an associative array with keys like
                // "KINGDOM", "PHYLUM", etc.
                // remove leading/trailing whitespaces
                $tax_string = trim($tax_string);
                // remove the last character which is always a period (.)
                $tax_string = substr($tax_string, 0, (strlen($tax_string)-1));
                $aTaxonomy = preg_split("/;/", trim($tax_string), -1, PREG_SPLIT_NO_EMPTY);
                array_walk($aTaxonomy, "trim_element");
                $seqobj->taxonomy = $aTaxonomy;
                $seqobj_taxonomy = $aTaxonomy;
                $aTaxonomy = array();
                $tax_string = "";
                $in_organism_flag = FALSE;
            }

            if (($seqdata_flag == true) && (substr($linestr,0,2) != "//"))
            {
                // Drop the leading position number, then glue the base groups together.
                $wordarray = explode(" ", trim($linestr));
                array_shift($wordarray);
                $seqline = implode("", $wordarray);
                $seqdata .= $seqline;
            }

            if (substr($linestr,0,6) == "ORIGIN") $seqdata_flag = true;

            if (substr($linestr,0,2) == "//")
            {
                $seqobj->sequence = $seqdata;
                // Was "$seqarr[$this->id] = $this;" - there is no $this in a plain function.
                $seqarr[$seqobj->id] = $seqobj;
                $seqdata_flag = false;
                $inseq_flag = false;
                break;
            }
        } // CLOSES if ($inseq_flag == TRUE)

        // The feature table is stored once the line that terminated it is revisited. Records
        // without a BASE COUNT line end their feature table at ORIGIN instead.
        if ( (trim(substr($linestr,0,10)) == "BASE COUNT") or (substr($linestr,0,6) == "ORIGIN") )
        {
            if (count($feat_r) > 0)
            {
                $seqobj->features = $feat_r;
                $seqobj_features = $feat_r;
            }
        }

        if (trim(substr($linestr,0,12)) == "FEATURES")
        { // OPENS if (trim(substr($linestr,0,12)) == "FEATURES")
            // The REFERENCE section was present for this SEQUENCE ENTRY so we set REFERENCE attribute.
            if (count($ref_array) > 0) $seqobj->reference = $ref_array;
            $lastsubkey = "";
            $feat_r = array();
            $qual_r = array();
            // Go to the next line.
            list($lineno, $linestr) = each($flines);

            // June 10, 2003: Assume feature keys do not repeat (or are unique) within a GenBank record.
            // But I suspect repeats are allowed by GB. This is a reminder to change feature array later.

            // This loops through each line in the entire FEATURES SECTION. It stops at
            // BASE COUNT, at ORIGIN, or when the input runs out ($linestr becomes NULL once
            // each() is exhausted); without the last two tests a record lacking a BASE COUNT
            // line made this loop spin forever.
            while( ($linestr !== NULL)
                   and (substr($linestr,0,10) != "BASE COUNT")
                   and (substr($linestr,0,6) != "ORIGIN") )
            { // FEATURES WHILE LOOP
                $label = trim(substr($linestr,0,21));
                $data = trim(substr($linestr,21));
                if (strlen($label) != 0)
                {
                    // At the beginning of a new SUBKEY.
                    $subkey = $label;
                    // Add/save the qualifier array (qual_r) of the previous SUBKEY to our big feat_r array.
                    if (count($qual_r) > 0)
                    {
                        $feat_r[$lastsubkey] = $qual_r;
                        $qual_r = array();
                    }
                    $qual = $subkey;
                    $qual_r[$qual] = "";
                    $qual_ctr = 0;
                    do
                    { // QUALIFIER WHILE LOOP
                        $qual_ctr++;
                        $qual_r[$qual] .= " " . $data;
                        list($lineno, $linestr) = each($flines);
                        $label = trim(substr($linestr,0,21));
                        $data = trim(substr($linestr,21));
                    } while( ($linestr !== NULL) and is_blank($label) and !(isa_qualifier($data)) );
                    $qual_r[$qual] = trim($qual_r[$qual]);
                    if (!(is_blank($label)))
                    {
                        $lastsubkey = $subkey;
                        $subkey = $label;
                    }
                }
                else
                {
                    // we are inside a subkey section but on the 2nd, 3rd, nth line which have blank LABELS.
                    if (isa_qualifier($data))
                    {
                        // Split on the first "=" only, so values that contain "=" stay intact.
                        $wordarray = preg_split("/=/", $data, 2);
                        $qual = $wordarray[0];
                        if (count($wordarray) == 1) $data = " ";
                        else $data = $wordarray[1];
                        $qual_r[$qual] = "";
                        $qual_ctr = 0;
                        do
                        { // QUALIFIER WHILE LOOP
                            $qual_ctr++;
                            $qual_r[$qual] .= " " . $data;
                            list($lineno, $linestr) = each($flines);
                            $label = trim(substr($linestr,0,21));
                            $data = trim(substr($linestr,21));
                        } while( ($linestr !== NULL) and is_blank($label) and !(isa_qualifier($data)) );
                        if (count($wordarray) > 1) $qual_r[$qual] = trim($qual_r[$qual]);
                        if ($qual == "/translation") $qual_r[$qual] = intrim($qual_r[$qual]);
                        if (!(is_blank($label)))
                        {
                            $lastsubkey = $subkey;
                            $subkey = $label;
                        }
                    }
                } // ELSE PART of if (strlen($label) != 0)
            } // FEATURES WHILE LOOP

            if (count($qual_r) > 0)
            {
                $feat_r[$lastsubkey] = $qual_r;
                $qual_r = array();
            }
            // Discards the first collected entry. When a REFERENCE section preceded FEATURES,
            // that entry is the re-read "FEATURES Location/Qualifiers" header line itself.
            array_shift($feat_r);
            array_walk($feat_r, "chg_firstkey");
            // Step back so the outer loop revisits the line that ended the feature table.
            prev($flines);
        } // CLOSES if (trim(substr($linestr,0,12)) == "FEATURES")
    } // CLOSES outermost while( list($lineno, $linestr) = each($flines) )

    if ($sql_db == "NONE")
    {
        $seqobj->moltype = $seqobj_moltype;
        $seqobj->seqlength = $seqobj_seqlength;
        $seqobj->date = $seqobj_date;
        $seqobj->strands = $seqobj_strands;
        $seqobj->topology = $seqobj_topology;
        $seqobj->division = $seqobj_division;
        $seqobj->seqarray = $seqarr;
        return $seqobj;
    }

    // START OF EXPORT TO SQL =================================
    // Open the database.
    $dbid = mysql_pconnect("localhost", "", "");
    if ($dbid == FALSE) die("Cannot open GenePHP database");
    mysql_select_db($sql_db) or die("Cannot select GenePHP database!");

    // Values placed between quotes by hand are escaped the same way format_mysqlcol()
    // escapes strings, so an apostrophe in the data cannot break the statement.
    $acc_sql = addslashes($seqobj_accession);
    $id_sql = addslashes($seqobj_id);

    // SEQUENCE: PRIME_ACC ENTRY_NAME SEQ_LENGTH MOL_TYPE DATE SOURCE SEQUENCE
    // For now, we assume that LOCUS = PRIME_ACC.
    // Later, check the ARRAYS if they are set and non-empty. I've done this for KEYWORDS.
    $seqlen = format_mysqlcol($seqobj_seqlength);
    $moltyp = format_mysqlcol($seqobj_moltype);
    $date = format_mysqlcol($seqobj_date);
    $source = format_mysqlcol($seqobj_source);
    $seqdat = format_mysqlcol($seqdata);
    $desc = format_mysqlcol($seqobj_description);
    $organi = format_mysqlcol($seqobj_organism);
    $sql = "INSERT INTO sequence VALUES('$acc_sql', '$id_sql', $seqlen, $moltyp, $date, $source, $seqdat, $desc, $organi)";
    $res = mysql_query($sql);
    if ($res == FALSE) die("INSERT INTO SEQUENCE query failed!<BR>$sql");
    else print "Sequence query okay<BR>";

    // GBSEQUENCE table:
    $stra = format_mysqlcol($seqobj_strands);
    $topo = format_mysqlcol($seqobj_topology);
    $divi = format_mysqlcol($seqobj_division);
    $segno = format_mysqlcol($seqobj_segment_no);
    $segct = format_mysqlcol($seqobj_segment_count);
    $versi = format_mysqlcol($seqobj_version);
    $ncbid = format_mysqlcol($seqobj_ncbi_gi_id);
    $sql = "INSERT INTO gbsequence VALUES('$acc_sql', $stra, $topo, $divi, $segno, $segct, $versi, $ncbid)";
    $res = mysql_query($sql);
    if ($res == FALSE) die("INSERT INTO GBSEQUENCE query failed!<BR>$sql");

    // ACCESSION table:
    foreach($seqobj_sec_accession as $sec_acc)
    {
        $sec_acc = addslashes($sec_acc);
        $sql = "INSERT INTO accession VALUES('$acc_sql', '$sec_acc')";
        $res = mysql_query($sql);
        if ($res == FALSE) die("INSERT INTO ACCESSION query failed!<BR>$sql");
        else print "Accession query okay<BR>";
    }

    // KEYWORDS table:
    if ((isset($seqobj_keywords)) and (is_array($seqobj_keywords)) and (count($seqobj_keywords) >= 1))
    {
        foreach($seqobj_keywords as $kword)
        {
            $kword = addslashes($kword);
            $sql = "INSERT INTO keywords VALUES('$acc_sql', '$kword')";
            $res = mysql_query($sql);
            if ($res == FALSE) die("INSERT INTO KEYWORD query failed!<BR>$sql");
            else print "Keyword query okay<BR>";
        }
    }
    else print "KEYWORDS array is empty!<BR>";

    // REFERENCES and related child tables
    foreach($ref_array as $ref_rec)
    {
        // Not every reference carries every subfield; a missing one becomes SQL NULL.
        $title = format_mysqlcol(isset($ref_rec["TITLE"]) ? $ref_rec["TITLE"] : NULL);
        $medline = format_mysqlcol(isset($ref_rec["MEDLINE"]) ? $ref_rec["MEDLINE"] : NULL);
        $pubmed = format_mysqlcol(isset($ref_rec["PUBMED"]) ? $ref_rec["PUBMED"] : NULL);
        $remark = format_mysqlcol(isset($ref_rec["REMARK"]) ? $ref_rec["REMARK"] : NULL);
        $journal = format_mysqlcol(isset($ref_rec["JOURNAL"]) ? $ref_rec["JOURNAL"] : NULL);
        $sql = "INSERT INTO reference VALUES('$acc_sql', " . $ref_rec["REFNO"] . ", '" . addslashes($ref_rec["BASERANGE"]) . "', $title, $medline, $pubmed, $remark, $journal)";
        $res = mysql_query($sql);
        if ($res == FALSE) die("INSERT INTO REFERENCES query failed!<BR>$sql");
        else print "References query okay<BR>";

        if ( (isset($ref_rec["AUTHORS"])) and (is_array($ref_rec["AUTHORS"])) )
        {
            foreach($ref_rec["AUTHORS"] as $author)
            {
                $author = addslashes($author);
                $sql = "INSERT INTO authors VALUES('$acc_sql', " . $ref_rec["REFNO"] . ", '$author')";
                $res = mysql_query($sql);
                if ($res == FALSE) die("INSERT INTO AUTHORS query failed!");
                else print "Authors query okay<BR>";
            }
        }
    }

    foreach($feat_r as $feat_key => $feat_qual)
    {
        $feat_key_sql = addslashes($feat_key);
        foreach($feat_qual as $qual => $qual_value)
        {
            if ($qual == "LOCATION")
            {
                $qual = "'" . $qual . "'";
                $qual_value = "'" . addslashes($qual_value) . "'";
            }
            else
            {
                $qual = format_mysqlcol($qual);
                $qual_value = format_mysqlcol($qual_value);
            }
            // We assume that numeric qualifier values only occupy one line (a reasonable assumption).
            $sql = "INSERT INTO gbfeatures VALUES('$acc_sql', '$feat_key_sql', $qual, $qual_value)";
            $res = mysql_query($sql);
            if ($res == FALSE) die("INSERT into GBFEATURES query failed!<BR>$sql");
            else print "Features query okay<BR>";
        }
    }
    // END OF EXPORT TO SQL ==================================================

    $seqobj->seqarray = $seqarr;
    return $seqobj;
} // Closes parse_genbank() constructor function definition

/*
var $sequence = "";
var $sec_accession = array(); // array
var $keywords = array();
var $taxonomy = array(); // array
var $reference = array(); // array
var $features = array(); // array
*/

// oSeq2r() copies the scalar attributes of a Seq object, plus its secondary accessions and
// keywords, into an associative array with upper-case keys. The sequence is trim()med.
function oSeq2r($oSeq)
{
    $aSeq = array();
    // later, we can shortcut this by using an EVAL() like in htmlform2oSeq() function.
    $aSeq["PRIM_ACCNO"] = $oSeq->accession;
    $aSeq["ENTRY_NAME"] = $oSeq->id;
    $aSeq["MOLTYPE"] = $oSeq->moltype;
    $aSeq["SEQLENGTH"] = $oSeq->seqlength;
    $aSeq["ENTRY_DATE"] = $oSeq->date;
    $aSeq["TOPOLOGY"] = $oSeq->topology;
    $aSeq["DIVISION"] = $oSeq->division;
    $aSeq["STRANDS"] = $oSeq->strands;
    $aSeq["DEFINITION"] = $oSeq->definition;
    $aSeq["VERSION"] = $oSeq->version;
    $aSeq["NCBI_GI_ID"] = $oSeq->ncbi_gi_id;
    $aSeq["SEGMENT_NO"] = $oSeq->segment_no;
    $aSeq["SEGMENT_COUNT"] = $oSeq->segment_count;
    $aSeq["SOURCE"] = $oSeq->source;
    $aSeq["ORGANISM"] = $oSeq->organism;
    $aSeq["SEQUENCE"] = trim($oSeq->sequence);
    $aSeq["SEC_ACCESSIONS"] = $oSeq->sec_accession;
    $aSeq["KEYWORDS"] = $oSeq->keywords;
    return $aSeq;
}

// htmlform2oSeq() builds a Seq object from submitted form variables ($vars).
// Convention: Name of form objects should be exactly the same as the name of
// Seq class attributes/properties.
function htmlform2oSeq($vars)
{
    $oSeq = new seq();
    $cl_vars = get_class_vars("seq");
    $class_properties = array_keys($cl_vars);

    foreach($vars as $key => $value)
    {
        // A direct dynamic-property assignment replaces the former eval() of a generated
        // statement. The strict match keeps numeric keys from loosely matching a property.
        if ( in_array((string) $key, $class_properties, TRUE) ) $oSeq->$key = $value;

        // NOTE: Here, we assign 'sec_accession' to the SELECT object in the calling form. If user
        // selects ALL options, 'sec_accession' will contain all numbers incl. primary acc no.
        if ( ($key === "sec_accession") and is_array($value) )
        {
            $oSeq->accession = $value[0];
            array_shift($value);
            $oSeq->sec_accession = $value;
        }
    }

    /*
    REFERENCE data field
    Example: REFERENCE 1 (bases 1 to 1037383)
    Output: ( (REFNO => 1, BASERANGE => (bases 1 to 10), AUTHORS => (Gregorio,S.E., Cruz,B.E.),
               TITLE => "Mol micro", JOURNAL => "Appl. Environ.", MEDLINE => 12345, PUBMED => 12345),
              (REFNO => 2, ... ) )

    FEATURES data field
    Output: ( SOURCE => ( LOCATION => "1..252", "/organism" => \"unidentified soil organism R6-11\", ... ),
              rRNA => ( LOCATION => , ....) )
    See end of this file for a sample Feature Section of a GenBank record.
    */
    $refctr = $vars["refctr"];
    $aRefs = array();
    for($i = 1; $i <= $refctr; $i++)
    {
        $aRef = array();
        $aRef["REFNO"] = $vars["refno$i"];
        $aRef["BASERANGE"] = $vars["basefrom$i"];
        $aRef["AUTHORS"] = $vars["authors$i"];
        $aRef["TITLE"] = $vars["title$i"];
        $aRef["MEDLINE"] = $vars["medline$i"];
        $aRef["PUBMED"] = $vars["pubmed$i"];
        $aRef["REMARK"] = $vars["remark$i"];
        $aRef["JOURNAL"] = $vars["journal$i"];
        $aRefs[] = $aRef;
    }
    $oSeq->reference = $aRefs;

    $aFeats = array();
    $featctr = $vars["featctr"];
    for($j = 1; $j <= $featctr; $j++)
    {
        $feature_key = $vars["feat_key$j"];
        $aFeat = array();
        // Qualifier fields are numbered qual<j>_1, qual<j>_2, ...; the first missing one
        // ends the feature.
        for($k = 1; $k > 0; $k++) // infinite loop coz $k would always be greater than 0.
        {
            $q_varname = "qual$j" . "_$k";
            $qv_varname = "qualval$j" . "_$k";
            if ( isset($vars[$q_varname]) ) $aFeat[$vars[$q_varname]] = stripslashes($vars[$qv_varname]);
            else
            {
                // June 10, 2003: Assume feature keys do not repeat (or are unique) within a GenBank record.
                // But I suspect repeats are allowed by GB. This is a reminder to change feature array later.
                $aFeats[$feature_key] = $aFeat;
                break;
            }
        }
    }
    $oSeq->features = $aFeats;
    return $oSeq;
}

// Accepts a Seq object and returns an array of strings. Each string is an SQL INSERT statement.
function insert_na_gb($oSeq)
{
    // Statements are only built here, never executed. Every string placed between quotes
    // by hand is passed through addslashes(), matching what format_mysqlcol() does, so an
    // apostrophe in a keyword, author name or location cannot break the statement.
    $aSQL_Stmt = array();

    // SQL for SEQUENCE table: PRIME_ACC ENTRY_NAME SEQ_LENGTH MOL_TYPE DATE SOURCE SEQUENCE
    $accession = addslashes($oSeq->accession);
    $entry_name = addslashes($oSeq->id);
    $seqlen = format_mysqlcol($oSeq->seqlength);
    $moltyp = format_mysqlcol($oSeq->moltype);
    $date = format_mysqlcol($oSeq->date);
    $source = format_mysqlcol($oSeq->source);
    $seqdat = format_mysqlcol($oSeq->sequence);
    $desc = format_mysqlcol($oSeq->definition);
    $organi = format_mysqlcol($oSeq->organism);
    $sql = "INSERT INTO sequence VALUES('$accession', '$entry_name', " .
           "$seqlen, $moltyp, $date, $source, $seqdat, $desc, $organi)";
    $aSQL_Stmt[] = $sql;

    // SQL for GBSEQUENCE table: STRANDS, TOPOLOGY, DIVISION, etc.
    $stra = format_mysqlcol($oSeq->strands);
    $topo = format_mysqlcol($oSeq->topology);
    $divi = format_mysqlcol($oSeq->division);
    $segno = format_mysqlcol($oSeq->segment_no);
    $segct = format_mysqlcol($oSeq->segment_count);
    $versi = format_mysqlcol($oSeq->version);
    $ncbid = format_mysqlcol($oSeq->ncbi_gi_id);
    $sql = "INSERT INTO gbsequence VALUES('$accession', $stra, $topo, $divi, $segno, $segct, $versi, $ncbid)";
    $aSQL_Stmt[] = $sql;

    // ACCESSION table:
    if ( (is_array($oSeq->sec_accession)) and (count($oSeq->sec_accession) > 0) )
    {
        foreach($oSeq->sec_accession as $sec_acc)
        {
            $sec_acc = addslashes($sec_acc);
            $sql = "INSERT INTO accession VALUES('$accession', '$sec_acc')";
            $aSQL_Stmt[] = $sql;
        }
    }

    // KEYWORDS table: for some reason, if the KEYWORDS SELECT listbox is blank in parse_na_gb_res.php,
    // its value becomes an array with one element, the blank string ''.
    if ( (isset($oSeq->keywords)) and (is_array($oSeq->keywords))
         and (count($oSeq->keywords) >= 1) and ($oSeq->keywords[0] != "") )
    {
        foreach($oSeq->keywords as $kword)
        {
            $kword = addslashes($kword);
            $sql = "INSERT INTO keywords VALUES('$accession', '$kword')";
            $aSQL_Stmt[] = $sql;
        }
    }

    // REFERENCES and related child tables
    if (is_array($oSeq->reference))
    {
        foreach($oSeq->reference as $ref_rec)
        {
            // Not every reference carries every subfield; a missing one becomes SQL NULL.
            $title = format_mysqlcol(isset($ref_rec["TITLE"]) ? $ref_rec["TITLE"] : NULL);
            $medline = format_mysqlcol(isset($ref_rec["MEDLINE"]) ? $ref_rec["MEDLINE"] : NULL);
            $pubmed = format_mysqlcol(isset($ref_rec["PUBMED"]) ? $ref_rec["PUBMED"] : NULL);
            $remark = format_mysqlcol(isset($ref_rec["REMARK"]) ? $ref_rec["REMARK"] : NULL);
            $journal = format_mysqlcol(isset($ref_rec["JOURNAL"]) ? $ref_rec["JOURNAL"] : NULL);
            $sql = "INSERT INTO reference VALUES('$accession', " . $ref_rec["REFNO"] . ", '" .
                   addslashes($ref_rec["BASERANGE"]) . "', $title, $medline, $pubmed, $remark, $journal)";
            $aSQL_Stmt[] = $sql;

            if ( (isset($ref_rec["AUTHORS"])) and (is_array($ref_rec["AUTHORS"]))
                 and (count($ref_rec["AUTHORS"]) > 0) )
            {
                foreach($ref_rec["AUTHORS"] as $author)
                {
                    $author = addslashes($author);
                    $sql = "INSERT INTO authors VALUES('$accession', " . $ref_rec["REFNO"] . ", '$author')";
                    $aSQL_Stmt[] = $sql;
                }
            }
        }
    }

    $feat_r = $oSeq->features;
    if (is_array($feat_r))
    {
        foreach($feat_r as $feat_key => $feat_qual)
        {
            // June 10, 2003: Are there feature subkeys without qualifier-value pairs? E.g.
            //   FEATURES   Location/Qualifiers
            //     source   1..252
            //     rRNA     252..256
            // The guard below skips such entries instead of feeding a non-array to foreach. - Serge
            if (!is_array($feat_qual)) continue;

            $feat_key_sql = addslashes($feat_key);
            foreach($feat_qual as $qual => $qual_value)
            {
                if ($qual == "LOCATION")
                {
                    $qual = "'" . $qual . "'";
                    $qual_value = "'" . addslashes($qual_value) . "'";
                }
                else
                {
                    $qual = format_mysqlcol($qual);
                    $qual_value = format_mysqlcol($qual_value);
                }
                // We assume that numeric qualifier values only occupy one line (a reasonable assumption).
                $sql = "INSERT INTO gbfeatures VALUES('$accession', '$feat_key_sql', $qual, $qual_value)";
                $aSQL_Stmt[] = $sql;
            }
        }
    }
    return $aSQL_Stmt;
}

/*
We begin by describing parse_swissprot() first. parse_swissprot() parses the Feature Table lines
(those that begin with FT) in a Swissprot data file, extracts the feature key name, from
endpoint, to endpoint, and description, and stores them in a (simple) array. process_ft()
then pushes this array into a larger associative array, called $swiss, which is also an
attribute of the Seq object. It is assigned a key of the form: FT_<feature_key_name>.
Examples are: FT_PEPTIDE, FT_DISULFID.
*/
// $swiss is modified in place. Each element of $ft_r is an array whose first item is the
// feature key name; the remaining items are appended under $swiss["FT_<name>"].
function process_ft(&$swiss, $ft_r)
{
    foreach($ft_r as $element)
    {
        $index = "FT_" . $element[0];
        array_shift($element);
        // Create the bucket on first use. (count() on a missing index is an error on PHP 8.)
        if ( !isset($swiss[$index]) or !is_array($swiss[$index]) ) $swiss[$index] = array();
        array_push($swiss[$index], $element);
    }
}

// at_entrystart() tests if the file pointer is at the start of a new sequence entry.
// Returns TRUE/FALSE for "GENBANK" and "SWISSPROT"; returns NULL for any other $dbformat.
function at_entrystart($linestr, $dbformat)
{
    if ($dbformat == "GENBANK") return (substr($linestr,0,5) == "LOCUS");
    elseif ($dbformat == "SWISSPROT") return (substr($linestr,0,2) == "ID");
}

// at_entryend() tests if the file pointer is at the end of a sequence entry, i.e. whether
// the trimmed line starts with "//". $dbformat is not used.
function at_entryend($linestr, $dbformat)
{
    return (left(trim($linestr),2) == "//");
}

// get_entryid() gets the primary accession number of the sequence entry which we are
// currently processing. This uniquely identifies a sequence entry.
//   GENBANK   : taken from columns 12-27 of $linestr.
//   SWISSPROT : taken from the next line of $flines when that line is an AC line; the array
//               pointer is stepped back afterwards. If the next line is not an AC line the
//               function returns NULL and the pointer stays advanced by one.
function get_entryid(&$flines, $linestr, $dbformat)
{
    if ($dbformat == "GENBANK") return trim(substr($linestr, 12, 16));
    elseif ($dbformat == "SWISSPROT")
    {
        list($lineno, $linestr) = each($flines);
        if (substr($linestr,0,2) == "AC")
        {
            $words = preg_split("/;/", intrim(substr($linestr,5)));
            prev($flines);
            return $words[0];
        }
    }
}

// genbank2sql() transfers data from GenBank files listed in the
// $gbfile_r array into a MySQL database named $sql_db.
// It die()s if a file cannot be opened and always returns FALSE.
function genbank2sql($gbfile_r, $sql_db)
{
    foreach($gbfile_r as $fname)
    {
        $fp = fopen($fname, "r");
        if ($fp == FALSE) die("Cannot open $fname!");

        $flines = array();
        // Loop on the result of fgets() itself. Testing feof() after the read dropped a
        // final line that had no trailing newline, and never ended on a read error.
        while ( ($linestr = fgets($fp, 101)) !== FALSE )
        {
            $flines[] = $linestr;
            if (left($linestr,2) == '//')
            {
                // End of one record: hand it to the parser and start collecting the next.
                parse_genbank($flines, $sql_db);
                $flines = array();
            }
        }
        fclose($fp);
    }
    return FALSE;
}

// May 12, 2003: REMINDER: Method for checking EOR marker is specific to file formats.
// Make it more general. Give user more power/flexibility in determining/specifying
// EOR markers.
// May 5, 2003: Added the $eor_marker parameter to accomodate KEGG (and other) database.
// line2r() copies the lines belonging to a single sequence entry into an array.
// Reading starts at the current position of $fpseq and stops after the first line that
// begins with $eor_marker (that line is included). Returns FALSE if the file ends first.
function line2r($fpseq, $eor_marker = '//')
{
    $flines = array();
    $len_eor = strlen($eor_marker);
    // Stop at end of file: the former while(1) kept appending FALSE forever when the
    // marker was missing.
    while ( ($linestr = fgets($fpseq, 141)) !== FALSE )
    {
        $flines[] = $linestr;
        if ( left($linestr,$len_eor) == $eor_marker ) return $flines;
    }
    return FALSE;
}

// May 18, 2003: Added this function. It converts a string array into an array of strings/lines.
// At this point, assume that $var contains EXACTLY ONE record of any type (e.g. Genbank, etc).
// Empty lines are dropped. Returns FALSE when no line remains.
function var2r($var)
{
    $flines = preg_split("/\n/", $var, -1, PREG_SPLIT_NO_EMPTY);
    if (count($flines) == 0) return FALSE;
    else return $flines;
}

// isa_qualifier() tests if the file pointer is at a line containing a feature qualifier.
// This applies only to GenBank sequence files.
function isa_qualifier($str)
{
    return (firstchar($str) == '/');
}

// fseekline() gets the byte offset (from beginning of file) of a particular line. The file is
// identified by $fp file pointer, while the line is identified by $lineno, which is zero-based.
// On success the file pointer is left at the start of that line and the offset is returned.
// Returns NULL (implicitly) when the file has no such line.
// NOTE: lines are read in chunks of at most 100 bytes, so a longer line counts as several.
function fseekline($fp, $lineno)
{
    $linectr = 0;
    $byteoff = 0; // offset of line 0; previously undefined when $lineno was 0
    fseek($fp, 0);
    while(!feof($fp))
    {
        $linestr = fgets($fp,101);
        if ($linectr == $lineno)
        {
            fseek($fp, $byteoff);
            return $byteoff;
        }
        $linectr++;
        $byteoff = ftell($fp);
    }
}

// bsrch_tabfile() searches for a particular sequence id ($seqid) within an *.IDX file
// (identified by $fp file pointer), and returns data located in its $col-th column.
function bsrch_tabfile($fp, $col, $seqid) { $linectr = 0; fseek($fp, 0); while(!feof($fp)) { fgets($fp, 41); $linectr++; } $lastline = $linectr; rewind($fp); if ($fp == FALSE) die("CANT OPEN FILE"); $searchspace = $lastline; $floor = 0; $ceiling = $lastline - 1; while(1) { $offset = ((int) ($searchspace/2)); $lineno = $floor + $offset; fseekline($fp, $lineno); $word = preg_split("/\s+/", trim(fgets($fp,81))); if ($word[$col] == $seqid) { $word[] = $lineno; return $word; } elseif ($seqid > $word[$col]) { $floor = $lineno + 1; $searchspace = $ceiling - $floor + 1; if ($searchspace <= 0) return FALSE; } else { $ceiling = $lineno - 1; $searchspace = $ceiling - $floor + 1; if ($searchspace <= 0) return FALSE; } } // fclose($fpidx); } // ================== CLASSES ======================== class SeqDB { // OPENS definition of SEQDB CLASS. var $dbname; var $data_fn; var $data_fp; var $dir_fn; var $dir_fp; var $seqptr; var $seqcount; var $dbformat; var $bof; var $eof; // We need the functions bof() and eof() to determine if we've reached the end of // file or not. // Two ways of doing this: 1) examine value of seqptr, or 2) maintain boolean variables eof and bof // first() positions the sequence pointer (i.e. the seqptr property of a Seq object) to // the first sequence in a database (SeqDB object). function first() { $this->seqptr = 0; } // last() positions the sequence pointer (i.e. the seqptr property of a Seq object) to // the last sequence in a database (SeqDB object). function last() { $this->seqptr = $this->seqcount-1; } // prev() (short for previous) positions the sequence pointer (i.e. the seqptr property of // a Seq object) to the sequence that comes before the current sequence. function prev() { if ($this->seqptr > 0) $this->seqptr--; else $this->bof = TRUE; } // next() positions the sequence pointer (i.e. the seqptr property of a Seq object) to the // sequence that comes after the current sequence. 
function next()
{
    // If the pointer is already at the last sequence it stays there and the eof flag
    // is set instead.
    // NOTE: relies on $this->seqcount holding the number of sequences.
    if ($this->seqptr < $this->seqcount - 1) {
        $this->seqptr++;
        // Having moved forwards we are no longer before the beginning.
        $this->bof = FALSE;
    }
    else $this->eof = TRUE;
}

// fetch() retrieves all data from the specified sequence record and returns them in the
// form of a Seq object. This method invokes one of several parser methods.
//
// Syntax: $seq = $db->fetch();         fetches the record the sequence pointer is on.
//         $seq = $db->fetch($seqid);   looks $seqid up in the index, moves the sequence
//                                      pointer to it and fetches that record.
//
// Returns the Seq object built by the parser that matches $this->dbformat (an empty Seq
// object if the format is neither GENBANK nor SWISSPROT), or FALSE if the id is not in
// the index or the record cannot be located or read. Dies if the object has been closed.
function fetch()
{
    if ($this->data_fn == "") die("Cannot invoke fetch() method from a closed object.");

    // The sequence id is optional; only read it when it was actually passed.
    $seqid = (func_num_args() > 0) ? func_get_arg(0) : FALSE;

    $fp = FALSE;        // index file (*.idx)
    $fpdir = FALSE;     // directory file (*.dir)
    $fpseq = FALSE;     // data file holding the record
    $flines = FALSE;

    // Single-pass block: any failure breaks out to the common clean-up code below,
    // so that every file opened so far gets closed on every exit path.
    do {
        $fp = fopen($this->data_fn, "r");
        if ($fp == FALSE) break;
        $fpdir = fopen($this->dir_fn, "r");
        if ($fpdir == FALSE) break;

        if ($seqid != FALSE) {
            $idx_r = bsrch_tabfile($fp, 0, $seqid);
            if ($idx_r === FALSE) break;
            // bsrch_tabfile() appends the index line number after the 3 index fields.
            if (isset($idx_r[3])) $this->seqptr = $idx_r[3];
        }
        else {
            // For now, SEQPTR determines CURRENT SEQUENCE ID. Alternative is to track curr line.
            fseekline($fp, $this->seqptr);
            $idx_r = preg_split("/\s+/", trim((string) fgets($fp)));
        }

        // An index row is: sequence id, file id, line number within the data file.
        if (count($idx_r) < 3) break;

        // The directory file maps the file id to the name of the data file.
        $dir_r = bsrch_tabfile($fpdir, 0, $idx_r[1]);
        if ($dir_r === FALSE || !isset($dir_r[1])) break;

        $fpseq = fopen($dir_r[1], "r");
        if ($fpseq == FALSE) break;

        fseekline($fpseq, $idx_r[2]);
        $flines = line2r($fpseq);
    } while (FALSE);

    if ($fp != FALSE) fclose($fp);
    if ($fpdir != FALSE) fclose($fpdir);
    if ($fpseq != FALSE) fclose($fpseq);

    if ($flines == FALSE) return FALSE;

    $myseq = new seq();
    if ($this->dbformat == "GENBANK") $myseq = $this->parse_id($flines);
    elseif ($this->dbformat == "SWISSPROT") $myseq = $this->parse_swissprot($flines);

    return $myseq;
}

// parse_swissprot() parses a Swissprot data file and returns a Seq object containing parsed data.
function parse_swissprot($flines) { // OPENS parse_swissprot() function $accession = array(); $date_r = array(); $desc = ""; $desc_lnctr = 0; $gename_r = array(); $os_r = array(); $os_linectr = 0; $os_str = ""; $oc_linectr = 0; $oc_str = ""; $ref_r = array(); $ra_r = array(); $ra_ctr = 0; $ra_str = ""; $rl_ctr = 0; $rl_str = ""; $db_r = array(); $ft_r = array(); $kw_str = ""; $kw_r = array(); $cc_string = ""; $in_cc_flag = FALSE; $aComments = array(); while ( list($no, $linestr) = each($flines) ) { // OPENS 1st (outermost) while ( list($no, $linestr) = each($flines) ) $linelabel = left($linestr, 2); $linedata = trim(substr($linestr, 5)); $lineend = right($linedata, 1); // May 20, 2003: Added this IF statement to handle CC (COMMENT) lines. // CC - COMMENTS data field. Freetext. Entries may be subdivided into TOPICS. // For now, ignore topics and just assume it's one long string. // I placed this at the TOP (ahead of REFERENCES or RN section) to avoid // complications brought about by the call to PREV() inside RN. if ($linelabel == "CC") { if (left($linedata,3) == '-!-') { // START OF A COMMENT BLOCK if (strlen(trim($cc_string)) > 0) { // There is a previous comment block that needs to be "saved". $aComments[] = $cc_string; } $cc_string = ""; $cc_string .= $linedata . " "; } else $cc_string .= $linedata . " "; $in_cc_flag = TRUE; } elseif ($in_cc_flag) { // automatically assume that $aComments contains something already. $aComments[] = trim($cc_string); $cc_string = ""; $in_cc_flag = FALSE; } // ID - IDENTIFICATION data field. if (left($linestr, 2) == "ID") { // OPENS if (left($linestr, 2) == "ID") $words = preg_split("/;/", substr($linestr, 5)); // May 20, 2003: Changed \s to [\s]+ below. $endc = preg_split("/[\s]+/", $words[0]); $entry_name = $endc[0]; // May 20, 2003: Added the -1 and PREG_SPLIT_NO_EMPTY arguments below. 
$namesrc = preg_split("/_/", $entry_name, -1, PREG_SPLIT_NO_EMPTY); $protein_name = $namesrc[0]; $protein_source = $namesrc[1]; $data_class = $endc[1]; // May 20, 2003: Enclosed $words[1] within a trim() function. $moltype = trim($words[1]); $length = (int) substr($words[2], 0, strlen($words[2])-4); } // CLOSES if (left($linestr, 2) == "ID") if (left($linestr, 2) == "AC") { // OPENS if (left($linestr, 2) == "AC") $accstr = $linedata; // May 20, 2003: Commented out the line below. We will not remove // the ; at the end of an AC line. Instead, we use PREG_SPLIT_NO_EMPTY. // $accstr = substr($accstr, 0, strlen($accstr)-1); // May 20, 2003: Added the -1, PREG_SPLIT_NO_EMPTY arguments below. // $accline = preg_split("/;/", intrim($accstr); $accline = preg_split("/;/", $accstr, -1, PREG_SPLIT_NO_EMPTY); $accession = array_merge($accession, $accline); } // CLOSES if (left($linestr, 2) == "AC") if (left($linestr, 2) == "DT") { // OPENS if (left($linestr, 2) == "DT") // DT DD-MMM-YEAR (REL. XX, COMMENT) $datestr = $linedata; $datestr = substr($datestr, 0, strlen($datestr)-1); $words = preg_split("/\(/", $datestr); // ( "DD-MMM-YEAR ", "REL. XX, COMMENT") $firstcomma = strpos($words[1], ","); // May 20, 2003: Converted $comment below into uppercase. $comment = strtoupper(trim(substr($words[1], $firstcomma+1))); /* print "DATESTR: ($datestr)"; print "<BR>"; print "COMMENT: ($comment)"; print "<BR>"; */ // ( "CREATED" => (date, rel), "LAST SEQUENCE UPDATE" => (date, rel), // "LAST ANNOTATION UPDATE" => (date, rel), COMMENT1 => (date, rel), // "COMMENT2" => (date, rel), ... ) if ($comment == "CREATED") { // OPENS if ($comment == "CREATED") // this DT line is a DATE CREATED line. $create_date = substr($words[0], 0, 11); $create_rel = substr($words[1], 5, ($firstcomma-5)); /* print "CREATE_DATE : (" . $create_date . ")"; print "<BR>"; print "CREATE_REL : (" . $create_rel . 
")"; print "<BR>"; */ $date_r[$comment] = array($create_date, $create_rel); } // CLOSES if ($comment == "CREATED") elseif ($comment == "LAST SEQUENCE UPDATE") { // OPENS elseif ($comment == "LAST SEQUENCE UPDATE") $sequpd_date = substr($words[0], 0, 11); $sequpd_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($sequpd_date, $sequpd_rel); } // CLOSES elseif ($comment == "LAST SEQUENCE UPDATE") elseif ($comment == "LAST ANNOTATION UPDATE") { // OPENS elseif ($comment == "LAST ANNOTATION UPDATE") $notupd_date = substr($words[0], 0, 11); $notupd_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($notupd_date, $notupd_rel); } // CLOSES elseif ($comment == "LAST ANNOTATION UPDATE") else { // OPENS else part of if ($comment == "CREATED") // For now, we do not check vs. duplicate comments. // We just overwrite the older comment with new one. $other_comment = $comment; $other_date = substr($words[0], 0, 11); $other_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($other_date, $other_rel); } // CLOSES else part of if ($comment == "CREATED") } // CLOSES if (left($linestr, 2) == "DT") // DE - DESCRIPTION data field. May be one or more lines. Concatenate and store as one string. // Keyword (FRAGMENT) or (FRAGMENTS) may be found at the end of this string. if (left($linestr, 2) == "DE") { // OPENS if (left($linestr, 2) == "DE") $desc_lnctr++; $linestr = $linedata; if ($desc_lnctr == 1) $desc .= $linestr; else $desc .= " " . $linestr; // Checks if (FRAGMENT) or (FRAGMENTS) is found at the end // of the DE line to determine if sequence is complete. if (right($linestr, 1) == ".") { // OPENS if (right($linestr, 1) == ".") if ( (strtoupper(right($linestr, 11)) == "(FRAGMENT).") or (strtoupper(right($linestr, 12)) == "(FRAGMENTS).") ) $is_fragment = TRUE; else $is_fragment = FALSE; } // CLOSE if (right($linestr, 1) == ".") } // CLOSES if (left($linestr, 2) == "DE") // KW - KEYWORDS data field. 
if ($linelabel == "KW") { $kw_str .= $linedata; if ($lineend == ".") { $kw_str = rem_right($kw_str); $kw_r = preg_split("/;/", $kw_str); array_walk($kw_r, "trim_element"); $kw_str = ""; } } /* OS - ORGANISM SPECIES data field. In most cases, one line, one entry (phrase) terminated by a period (.). In some cases, one line, with two or more entries separated by ", AND". In rare cases, two or more lines, each entry separated by ", AND", and all the lines are to be concatenated. The last character of the last line is a period (.). OS entries will be converted to UPPERCASE. Storage: ( "org species1", "org species2", ... ) */ if ($linelabel == "OS") { // OPENS if ($linelabel == "OS") $os_linectr++; if ($lineend != ".") { // we are not yet at the last OS line. if ($os_linectr == 1) $os_str .= $linedata; else $os_str .= " $linedata"; } else { // we are at the last OS line. $os_str .= " $linedata"; // May 20, 2003: Convert OS entry to uppercase by enclosing within STRTOUPPER function. $os_str = strtoupper(rem_right($os_str)); $os_line = preg_split("/\, AND /", $os_str); } } // CLOSES if ($linelabel == "OS") // OG - ORGANELLE data field. if ($linelabel == "OG") $organelle = rem_right($linedata); // OC - ORGANISM (TAXONOMIC) CLASSIFICATION data field. if ($linelabel == "OC") { $oc_linectr++; if ($lineend != ".") { // we are not yet at the last OS line. if ($oc_linectr == 1) $oc_str .= $linedata; else $oc_str .= " $linedata"; } else { // we are at the last OS line. $oc_str .= " $linedata"; $oc_str = rem_right($oc_str); $oc_line = preg_split("/;/", $oc_str); array_walk($oc_line, "trim_element"); } } // FT - FEATURES TABLE data field. if ($linelabel == "FT") { $ft_key = trim(substr($linestr, 5, 8)); $ft_from = (int) trim(substr($linestr, 14, 6)); $ft_to = (int) trim(substr($linestr, 21, 6)); $ft_desc = rem_right(trim(substr($linestr, 34))); $ft_r[] = array($ft_key, $ft_from, $ft_to, $ft_desc); } // ( rn => ( "rp" => "my rp", "rc" => ("tok1" => "value", ...) 
) ) // ( 10 => ( "RP" => "my rp", "RC" => ("PLASMID" => "PLA_VAL", ... ) ) ) // Example: DR AARHUS/GHENT-2DPAGE; 8006; IEF. /* DR - DATABASE (CROSS) REFERENCE data field. DATA_BANK_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER We assume that all three data items are mandatory/present in all DR entries. ( refno => ( (dbname1, pid1, sid1), (dbname2, pid2, sid2), ... ), 1 => ( ... ) ) ( 0 => ( (REBASE, pid1, sid1), (WORPEP, pid2, sid2), ... ), 1 => ( ... ) ) */ if ($linelabel == "DR") { $linedata = rem_right($linedata); $dr_line = preg_split("/;/", $linedata); array_walk($dr_line, "trim_element"); $db_name = $dr_line[0]; $db_pid = $dr_line[1]; $db_sid = $dr_line[2]; $db_r[] = array($db_name, $db_pid, $db_sid); } /* GN - GENES data field. Lists the name(s) of genes that encode for the protein entry. Assumed to be exactly one line. Each gene is separated by keyword AND. Synonyms for genes are separated by OR. In rare cases, there are ANDs mixed with ORs. Grouping paren- theses ( or ) are allowed. Store as an array. Examples: GNAME1 OR GNAME2 ( (GNAME1, GNAME2) ) GNAME1 AND GNAME2 ( (GNAME1), (GNAME2) ) GNAME1 AND (GNAME2 OR GNAME3) ( (GNAME1), (GNAME2, GNAME3) ) GNAME1 OR (GNAME2 AND GNAME3) NOT POSSIBLE!!! */ if (left($linestr, 2) == "GN") { // OPENS if (left($linestr, 2) == "GN") /* ALGORITHM: 1) Split expressions by " AND ". 2) Test each "token" if in between parentheses or not. 3) If not, then token is a singleton, else it's a multiple-ton. 4) Singletons are translated into (GNAME1). Multiple-tons are translated into (GNAME1, GNAME 2). 5) Push gene name array into larger array. Go to next token. */ // Remove "GN " at the beginning of our line. $linestr = trim(substr($linestr, 5)); // Remove the last character which is always a period. $linestr = substr($linestr, 0, strlen($linestr)-1); // Go here if you detect at least one ( or ). if ( is_false(strpos($linestr, "(")) ) { // GN Line does not contain any parentheses. 
// Ergo, it is made up of all OR's or AND's but not both. if (strpos($linestr, " OR ") != FALSE) { // Case 1: GNAME1 OR GNAME2. $temp = preg_split("/ OR /", $linestr); $gename_r[] = $temp; } elseif (strpos($linestr, " AND ") != FALSE) { // Case 2: GNAME1 AND GNAME2 AND GNAME3. $temp = preg_split("/ AND /", $linestr); foreach($temp as $gene) $gename_r[] = array($gene); } else $gename_r[] = array($linestr); // Case 0: GN GENENAME1. One gene name (no OR, AND). } else { // OPENS else part of if ( is_false(strpos($linestr, "(")) ) // GN Line contains at least one pair of parentheses. // Case 3: GNAME1 AND (GNAME2 OR GNAME3) => ( (GNAME1), (GNAME2, GNAME3) ) // COMMENTS # 1 below. $temp = preg_split("/ AND /", $linestr); foreach($temp as $gene) { // OPENS foreach($temp as $gene) if (substr($gene, 0, 1) == "(") { // a list of 2 or more gene names OR'ed together // remove the "(" and ")" at both ends of the string. $gene = substr($gene, 1); $gene = substr($gene, 0, strlen($gene)-1); $genelist = preg_split("/ OR /", $gene); $gename_r[] = $genelist; } else { // singleton $gename_r[] = array($gene); } } // CLOSES foreach($temp as $gene) } // CLOSES else part of if ( is_false(strpos($linestr, "(")) ) } // CLOSES if (left($linestr, 2) == "GN") /* SQ - SEQUENCE data field. 
0123456789012345678901234567890123456789 SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CN; */ if ($linelabel == "SQ") { // OPENS if ($linelabel == "SQ") $linedata = rem_right($linedata); // XXXX AA, XXXX MW, XXXX CN $words = preg_split("/;/", substr($linedata, 8)); $aa = preg_split("/\s+/", trim($words[0])); $aa_count = (int) trim($aa[0]); $mw = preg_split("/\s+/", trim($words[1])); $mol_wt = (int) trim($mw[0]); $cn = preg_split("/\s+/", trim($words[2])); $chk_no = trim($cn[0]); $chk_method = trim($cn[1]); $sequence = ""; while ( list($no, $linestr) = each($flines) ) { $linelabel = left($linestr, 2); if ($linelabel == "//") break; $linedata = intrim(trim($linestr)); $sequence .= $linedata; } } // CLOSES if ($linelabel == "SQ") // RN - REFERENCE NUMBER data field. if ($linelabel == "RN") { // OPENS "RN" // Remove the [ and ] between the reference number. $refno = substr(rem_right($linedata), 1); $rc_ctr = 0; $rc_str = ""; $rc_flag = FALSE; $inner_r = array(); while ( list($no, $linestr) = each($flines) ) { // OPENS 2nd WHILE $linelabel = left($linestr, 2); $linedata = trim(substr($linestr, 5)); $lineend = right($linedata, 1); if ($linelabel == "RP") $inner_r["RP"] = $linedata; elseif ($linelabel == "RC") { // OPENS elseif ($linelabel == "RC") $rc_str .= $linedata; while ( list($no, $linestr) = each($flines) ) { // OPENS 3rd WHILE $linelabel = left($linestr, 2); $linedata = trim(substr($linestr, 5)); $lineend = right($linedata, 1); if ($linelabel == "RC") $rc_str .= " $linedata"; else { // opens else // May 20, 2003: I dislike this use of PREV(). It messes up our ARRAY POINTER. // It causes the ARRAY POINTER to get out of synch with the value of $linestr, // $linelabel and $linedata. 
prev($flines); break; } // closes else } // CLOSES 3rd WHILE // we remove the last character if it is ";" $rc_str = trim($rc_str); if (right($rc_str,1) == ";") $rc_str = rem_right($rc_str); $rc_line = preg_split("/;/", trim($rc_str)); array_walk($rc_line, "trim_element"); $innermost = array(); foreach($rc_line as $tokval_str) { // here we assume that there is no whitespace // before or after (left or right of) the "=". $tokval_r = preg_split("/=/", $tokval_str); $token = $tokval_r[0]; $value = $tokval_r[1]; $innermost[$token] = $value; } $inner_r["RC"] = $innermost; } // CLOSES elseif ($linelabel == "RC") elseif ($linelabel == "RM") { // We have no idea what RM is about, so we assume it's a single-line entry. // which may occur 0 to 1 times inside a SWISSPROT SEQUENCE RECORD. $inner_r["RM"] = $linedata; } elseif ($linelabel == "RX") { $linedata = rem_right($linedata); $rx_line = preg_split("/;/", intrim($linedata)); $inner_r["RX_BDN"] = $rx_line[0]; $inner_r["RX_ID"] = $rx_line[1]; } elseif ($linelabel == "RA") { $ra_ctr++; if ($ra_ctr == 1) $ra_str = $linedata; else $ra_str .= " $linedata"; if ($lineend == ";") { $ra_str = rem_right($ra_str); $ra_r = preg_split("/\,/", $ra_str); array_walk($ra_r, "trim_element"); $inner_r["RA"] = $ra_r; } } elseif ($linelabel == "RL") { $rl_ctr++; if ($rl_ctr == 1) $rl_str = $linedata; else $rl_str .= " $linedata"; } else { $inner_r["RL"] = $rl_str; prev($flines); break; } } // CLOSES 2nd WHILE $ref_r[$refno-1] = $inner_r; $ra_str = ""; $ra_ctr = 0; $rl_str = ""; $rl_ctr = 0; } // CLOSES "RN" } // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) ) $seqobj = new seq(); $seqobj->id = $protein_name; $seqobj->seqlength = $length; $seqobj->moltype = $moltype; $seqobj->date = $create_date; $seqobj->accession = $accession[0]; // May 20, 2003: Commented out the line below which removes the first // element from the ACCESSION array. I also removed the line which // assigns a value to the SEC_ACCESSION property/attribute. 
Get the // other accession numbers from the SWISSPROT attribute set further down. // array_shift($accession); // $seqobj->sec_accession = $accession; $seqobj->source = $os_line; $seqobj->organism = $oc_line; $seqobj->sequence = $sequence; $seqobj->definition = $desc; $seqobj->keywords = $kw_r; $genbank_ref_r = array(); $inner_r = array(); foreach($ref_r as $key => $value) { $inner_r["REFNO"] = $key; $db_id = $value["RX_BDN"]; $inner_r[$db_id] = $value["RX_ID"]; $inner_r["REMARKS"] = $value["RP"]; $inner_r["COMMENT"] = $value["RC"]; $inner_r["TITLE"] = $value["RL"]; $inner_r["JOURNAL"] = $value["RL"]; $inner_r["AUTHORS"] = $value["RA"]; $genbank_ref_r[] = $inner_r; } $seqobj->reference = $genbank_ref_r; $swiss = array(); $swiss["ID"] = $protein_name; $swiss["PROT_NAME"] = $protein_name; $swiss["MOL_TYPE"] = $moltype; $swiss["PROT_SOURCE"] = $protein_source; $swiss["DATA_CLASS"] = $data_class; $swiss["LENGTH"] = $length; $swiss["CREATE_DATE"] = $create_date; $swiss["CREATE_REL"] = $create_rel; $swiss["SEQUPD_DATE"] = $sequpd_date; $swiss["SEQUPD_REL"] = $sequpd_rel; $swiss["NOTUPD_DATE"] = $notupd_date; $swiss["NOTUPD_REL"] = $notupd_rel; // ACCESSION is an ARRAY. $swiss["ACCESSION"] = $accession; $swiss["PRIM_AC"] = $accession[0]; $swiss["DESC"] = $desc; $swiss["IS_FRAGMENT"] = $is_fragment; // KEYWORDS is an ARRAY. $swiss["KEYWORDS"] = $kw_r; // ORGANISM is an ARRAY. $swiss["ORGANISM"] = $os_line; $swiss["ORGANELLE"] = $organelle; // FT_<keyword> is an ARRAY. process_ft($swiss, $ft_r); $swiss["AMINO_COUNT"] = $aa_count; $swiss["MOLWT"] = $mol_wt; $swiss["CHK_NO"] = $chk_no; $swiss["CHK_METHOD"] = $chk_method; $swiss["SEQUENCE"] = $sequence; // GENE_NAME is an ARRAY. $swiss["GENE_NAME"] = $gename_r; // ORG_CLASS is an ARRAY. $swiss["ORG_CLASS"] = $oc_line; // REFERENCE is an ARRAY. $swiss["REFERENCE"] = $ref_r; // May 20, 2003: Added a statement that stores contents of the DR section/field into Seq class. 
// DR data field $swiss["DR"] = $db_r; // May 20, 2003: Added a statement that stores contents of the CC section/field into Seq class. // CC data field $swiss["COMMENTS"] = $aComments; $seqobj->swissprot = $swiss; // ARRAY return $seqobj; } // CLOSES parse_swissprot() // parse_id() parses a GenBank data file and returns a Seq object containing parsed data. function parse_id($flines, $sql_db = "NONE") { $seqarr = array(); $inseq_flag = false; $seqdata_flag = false; $accession_flag = false; $ref_array = array(); $feature_array = array(); $entry_ctr = 0; $ref_ctr = 0; $maxlength = 0; $minlength = 999999; $tot_seqlength = 0; while( list($lineno, $linestr) = each($flines) ) { // OPENS outermost while( list($lineno, $linestr) = each($flines) ) if (substr($linestr,0,5) == "LOCUS") { $entry_ctr++; $ref_ctr = 0; $ref_array = array(); // This is the beginning of a SEQUENCE ENTRY. $seqdata = ""; $seqobj = new seq(); // to be removed later. i am retaining this to avoid // unexpected errors. $seqobj->id = trim(substr($linestr, 12, 16)); $seqobj_id = trim(substr($linestr, 12, 16)); $seqobj_seqlength = trim(substr($linestr, 29, 11)) * 1; $tot_seqlength += $seqobj->seqlength; if ($seqobj->seqlength > $maxlength) $maxlength = $seqobj->seqlength; if ($seqobj->seqlength < $minlength) $minlength = $seqobj->seqlength; $seqobj_moltype = substr($linestr, 47, 6); $seqobj_date = strtoupper(substr($linestr, 68, 11)); // OPEN THE MYSQL DATABASE ================================= if ($sql_db == "NONE") {} else { $dbid = mysql_pconnect("localhost", "", ""); if ($dbid == FALSE) die("Cannot open GenePHP database"); mysql_select_db($sql_db) or die("Cannot select GenePHP database!"); // PRIME_ACC LOCUS SEQ_LENGTH STRANDS MOL_TYPE TOPOLOGY // DIVISION DATE SOURCE SEQUENCE // For now, we assume that LOCUS = PRIME_ACC. 
$sql = "INSERT INTO sequence VALUES('$seqobj_id', '$seqobj_id', $seqobj_seqlength, '', '$seqobj_moltype', '', '', '$seqobj_date', '', '')"; $res = mysql_query($sql); if ($res == FALSE) die("Query failed!"); } /* if (substr($linestr, 44, 3) == "ss-") $seqobj->strands = "SINGLE"; elseif (substr($linestr, 44, 3) == "ds-") $seqobj->strands = "DOUBLE"; elseif (substr($linestr, 44, 3) == "ms-") $seqobj->strands = "MIXED"; $seqobj->topology = strtoupper(substr($linestr, 55, 8)); $seqobj->division = strtoupper(substr($linestr, 64, 3)); */ $inseq_flag = true; } if (trim(substr($linestr,0,10)) == "BASE COUNT") { if (count($feat_r) > 0) $seqobj->features = $feat_r; } if (trim(substr($linestr,0,12)) == "FEATURES") { // OPENS if (trim(substr($linestr,0,12)) == "FEATURES") // The REFERENCE section was present for this SEQUENCE ENTRY so we set REFERENCE attribute. if (count($ref_array) > 0) $seqobj->reference = $ref_array; $lastsubkey = ""; $feat_r = array(); $qual_r = array(); // Go to the next line. list($lineno, $linestr) = each($flines); // This loops through each line in the entire FEATURES SECTION. while( substr($linestr,0,10) != "BASE COUNT" ) { // FEATURES WHILE LOOP $label = trim(substr($linestr,0,21)); $data = trim(substr($linestr,21)); if (strlen($label) != 0) { // At the beginning of a new SUBKEY. $subkey = $label; // Add/save the qualifier array (qual_r) of the previous SUBKEY to our big feat_r array. if (count($qual_r) > 0) { $feat_r[$lastsubkey] = $qual_r; $qual_r = array(); } $qual = $subkey; $qual_r[$qual] = ""; $qual_ctr = 0; do { // QUALIFIER WHILE LOOP $qual_ctr++; $qual_r[$qual] .= " " . $data; list($lineno, $linestr) = each($flines); $label = trim(substr($linestr,0,21)); $data = trim(substr($linestr,21)); } while( is_blank($label) and !(isa_qualifier($data)) ); if (!(is_blank($label))) { $lastsubkey = $subkey; $subkey = $label; } } else { // we are inside a subkey section but on the 2nd, 3rd, nth line which have blank LABELS. 
if (isa_qualifier($data)) { $wordarray = preg_split("/=/", $data); $qual = $wordarray[0]; $data = $wordarray[1]; $qual_r[$qual] = ""; $qual_ctr = 0; do { // QUALIFIER WHILE LOOP $qual_ctr++; $qual_r[$qual] .= " " . $data; list($lineno, $linestr) = each($flines); $label = trim(substr($linestr,0,21)); $data = trim(substr($linestr,21)); } while( is_blank($label) and !(isa_qualifier($data)) ); if (!(is_blank($label))) { $lastsubkey = $subkey; $subkey = $label; } } } // ELSE PART of if (strlen($subkey) != 0) } // FEATURES WHILE LOOP if (count($qual_r) > 0) { $feat_r[$lastsubkey] = $qual_r; $qual_r = array(); } prev($flines); } // CLOSES if (trim(substr($linestr,0,12)) == "FEATURES") if (substr($linestr,0,10) == "DEFINITION") { $wordarray = explode(" ", $linestr); array_shift($wordarray); $seqobj->definition = implode(" ", $wordarray); } if ($inseq_flag == TRUE) { // OPENS if ($inseq_flag == TRUE) if (trim(substr($linestr, 0, 12)) == "REFERENCE") { // at this point, we are at the line with REFERENCE x (base y of z) in it. $wordarray = preg_split("/\s+/", trim(substr($linestr,12))); $ref_rec = array(); $ref_rec["REFNO"] = $wordarray[0]; array_shift($wordarray); $ref_rec["BASERANGE"] = implode(" ", $wordarray); $lastsubkey = ""; $subkey_lnctr = 0; while( list($lineno, $linestr) = each($flines) ) { $subkey = trim(substr($linestr,0,12)); // If current subkey is blank string, then this is a continuation of the last subsection. if (strlen($subkey) == 0) $subkey = $lastsubkey; // If we are at the next subkey section (e.g. lastsubkey was AUTHORS, and current is TITLE). if ($subkey != $lastsubkey) $subkey_lnctr = 0; switch ($subkey) { case "AUTHORS": $subkey_lnctr++; $wordarray = preg_split("/\s+/", trim(substr($linestr,12))); // we remove comma at the end of a name, and the element "and". 
$newarray = array(); foreach($wordarray as $authname) { if (strtoupper($authname) != "AND") { if (substr($authname, strlen($authname)-1, 1) == ",") $authname = substr($authname, 0, strlen($authname)-1); $newarray[] = $authname; } } if ($subkey_lnctr == 1) $ref_rec["AUTHORS"] = $newarray; else $ref_rec["AUTHORS"] = array_merge($ref_rec["AUTHORS"], $newarray); break; case "TITLE": $subkey_lnctr++; if ($subkey_lnctr == 1) $ref_rec["TITLE"] = trim(substr($linestr,12)); else $ref_rec["TITLE"] .= " " . trim(substr($linestr,12)); break; case "JOURNAL": $subkey_lnctr++; if ($subkey_lnctr == 1) $ref_rec["JOURNAL"] = trim(substr($linestr,12)); else $ref_rec["JOURNAL"] .= " " . trim(substr($linestr,12)); break; case "MEDLINE": $ref_rec["MEDLINE"] = substr($linestr, 12, 8); break; case "PUBMED": $ref_rec["PUBMED"] = substr($linestr, 12, 8); break; case "REMARK": $subkey_lnctr++; if ($subkey_lnctr == 1) $ref_rec["REMARK"] = trim(substr($linestr,12)); else $ref_rec["REMARK"] .= " " . trim(substr($linestr,12)); break; case "COMMENT": $subkey_lnctr++; if ($subkey_lnctr == 1) $ref_rec["COMMENT"] = trim(substr($linestr,12)); else $ref_rec["COMMENT"] .= " " . trim(substr($linestr,12)); break; } if ($subkey == "FEATURES") { prev($flines); break; } if ($subkey == "REFERENCE") { $ref_ctr++; prev($flines); break; } $lastsubkey = $subkey; } array_push($ref_array, $ref_rec); } if (trim(substr($linestr, 0, 12)) == "SOURCE") { // For now, assume a single-line SOURCE field. $seqobj->source = substr($linestr, 12); } if (trim(substr($linestr, 0, 12)) == "SEGMENT") { $seqobj->segment = substr($linestr, 12); $wordarray = preg_split("/\s+/", trim(substr($linestr,12))); $seqobj->segment_no = $wordarray[0]; $seqobj->segment_count = $wordarray[2]; } // For now, assume that KEYWORDS field consists of exactly one line. 
if (trim(substr($linestr, 0, 12)) == "KEYWORDS") { $wordarray = preg_split("/\s+/", trim($linestr)); array_shift($wordarray); $wordarray = preg_split("/;+/", implode(" ", $wordarray)); if ($wordarray[0] != ".") $seqobj->keywords = $wordarray; } if (substr($linestr, 0, 7) == "VERSION") { // Assume that VERSION line is made up of exactly 2 or 3 tokens. $wordarray = preg_split("/\s+/", trim($linestr)); $seqobj->version = $wordarray[1]; if (count($wordarray) == 3) $seqobj->ncbi_gi_id = $wordarray[2]; $accession_flag = false; } if ($accession_flag == TRUE) { // 2nd, 3rd, etc. line of ACCESSION field. $wordarray = preg_split("/\s+/", trim($linestr)); $this->sec_accession = array_merge($this->sec_accession, $wordarray); } if (substr($linestr,0,9) == "ACCESSION") { $wordarray = preg_split("/\s+/", trim($linestr)); $seqobj->accession = $wordarray[1]; array_shift($wordarray); array_shift($wordarray); $seqobj->sec_accession = $wordarray; $accession_flag = true; } if (substr($linestr,0,10) == " ORGANISM") { $seqobj->organism = substr($linestr,12); } if (($seqdata_flag == true) && (substr($linestr,0,2) != "//")) { $wordarray = explode(" ", trim($linestr)); array_shift($wordarray); $seqline = implode("", $wordarray); $seqdata .= $seqline; } if (substr($linestr,0,6) == "ORIGIN") $seqdata_flag = true; if (substr($linestr,0,2) == "//") { $seqobj->sequence = $seqdata; $seqarr[$this->id] = $this; $seqdata_flag = false; $inseq_flag = false; break; } } // CLOSES if ($inseq_flag == TRUE) } // CLOSES outermost while( list($lineno, $linestr) = each($flines) ) $seqobj->seqarray = $seqarr; return $seqobj; } // Closes parse_id() constructor function definition // open() opens or prepares the SeqDB for processing. Opposite of close(). function open($dbname) { if (file_exists($dbname . ".idx") == FALSE) die("ERROR: Index file $dbname.IDX does not exist!"); if (file_exists($dbname . 
".dir") == FALSE) die("ERROR: Index file $dbname.DIR does not exist!"); $this->dbname = $dbname; $this->data_fn = $dbname . ".idx"; $this->dir_fn = $dbname . ".dir"; $this->seqptr = 0; } // close() closes the SeqDB database after we're through using it. Opposite of open() method. function close() { // // Close simply assigns null values to attributes of the seqdb() object. // Methods like fetch would not function properly if these values are null. $this->dbname = ""; $this->data_fn = ""; $this->dir_fn = ""; $this->seqptr = -1; } /* SeqDB() is the constructor method for the SeqDB class. It does many things like create and/or read a database's index files, initialize certain SeqDB properties, etc. Syntax: $seqdb = new seqdb($dbname, $dbformat, $file1, $file2, ...); Behavior: if $dbname exists and user gave no specific values for $file1, $file2, ... then seqdb() object USES/OPENS existing database (index files). if $dbname exists and user gave specific values for $file1, $file2, ... then seqdb() object OVERWRITES existing database (index files). if $dbname does not exist, then seqdb() object CREATES new database. even if $file1, $file2, ... are not specified. We provide the create() method to explicitly create a new database. We provide the use() or open() method to explicitly use an existing database. */ function SeqDB() { // Get all the arguments passed to this function. $args = func_get_args(); $dbname = $args[0]; $dbformat = strtoupper($args[1]); if (strlen($dbformat) == 0) $dbformat = "GENBANK"; $this->dbformat = $dbformat; $datafile = array(); for($i = 2; $i < count($args); $i++) $datafile[] = $args[$i]; /* db exists fileX args ACTION TESTED Y Y create okay Y N use N Y create okay N N create okay */ // if user provided specific values for $file1, $file2, ... parameters. if ((file_exists($dbname)) and (count($datafile) > 0)) { // For now, assume USING/OPENING a database is to be done in READ ONLY MODE. 
$this->open($dbname); } else { // March 26, 2003: I switched the 2 lines below with // the line: $this->open($dbname); to avoid die-ing with // the error message: "ERROR: Index file does not exist!" $fp = fopen($dbname . ".idx", "w+"); $fpdir = fopen($dbname . ".dir", "w+"); // Creates blank data and directory index files, and sets seqptr to 0, etc. $this->open($dbname); // if user did not provide any datafile name. if (count($datafile) == 0) return; $temp_r = array(); // Build our *.DIR file foreach($datafile as $fileno=>$filename) { $outline = "$fileno $filename\n"; fputs($fpdir, $outline); // Automatically create an index file containing info across all data files. $flines = file($filename); $totlines = count($flines); while( list($lineno, $linestr) = each($flines) ) { // if (substr($linestr,0,5) == "LOCUS") if (at_entrystart($linestr, $dbformat)) { // $current_id = trim(substr($linestr, 12, 16)); $current_id = get_entryid($flines, $linestr, $dbformat); $outline = "$current_id $fileno $lineno\n"; // Put entries in an array first, sort them, then write to *.IDX file. // temp_r = ("AB1234" => ("AB1234", 1, 12), "BC4321" => ... ); $temp_r[$current_id] = array($current_id, $fileno, $lineno); } } // Sort our array by its keys. ksort($temp_r); } // Build our *.IDX array. $this->seqcount = count($temp_r); foreach($temp_r as $seqid=>$line_r) { $outline = $line_r[0] . " " . $line_r[1] . " " . $line_r[2] . "\n"; $fio = fputs($fp, $outline); } } fclose($fp); fclose($fpdir); } // CLOSES definition of SeqDB constructor function. } // CLOSES definition of SEQDB CLASS. class DataScan { // OPENS definition of SEQDB CLASS. var $seqcount; var $dbformat; var $udf; var $results; // var $seqptr; // var $bof; // var $eof; /* Constructor method for the DataScan class. Syntax: Datascan($db (e.g. "genbank") to uppercase (e.g. "GENBANK") $dbformat = strtoupper($args[0]); // If 'dbformat' not provided, set it to 'dbformat' property of this object. 
if (strlen($dbformat) == 0) $dbformat = "GENBANK"; $this->dbformat = $dbformat; // Store 2nd argument (name of function to apply to each record) in $funcname string variable. $udfuncname = $args[1]; // Put 3rd, 4th, etc. arguments in an array named $datafile. $datafile = array(); for($i = 2; $i < count($args); $i++) $datafile[] = $args[$i]; // if $datafile is empty, quit this function. if (count($datafile) == 0) return; // Set counter of records in file to 0. $rec_ctr = 0; // Initialize array that will hold all return values from each record in file. $aRetval = array(); foreach($datafile as $filename) { // Transfer entire contents of file to $flines array. $flines = file($filename); $in_entry = FALSE; $entry_end = FALSE; while( list($lineno, $linestr) = each($flines) ) { if (at_entrystart($linestr, $dbformat)) { $in_entry = TRUE; $entry_end = FALSE; $rec_ctr++; $record = array(); } if (at_entryend($linestr, $dbformat)) { $in_entry = FALSE; $entry_end = TRUE; } // while inside a record, keep adding each line to the $record array. if ($in_entry == TRUE) $record[] = $linestr; elseif ($entry_end == TRUE) { // run the UDF $funcname with record contents ($record) and $dbformat as arguments. $call = "\$retval = $udfuncname(\$record, \$dbformat)" . ";"; eval($call); // add value (returned by applying UDF to the current record) to array of return values ($aRetval). $aRetval[] = $retval; $entry_end = FALSE; } } } // Set value of 'seqcount' attribute to the last value of our record counter. $this->seqcount = $rec_ctr; // Return array of return values. $this->results = $aRetval; } // CLOSES definition of DataScan constructor function. } /* FEATURES EXAMPLE: FEATURES Location/Qualifiers source 1..252 /organism="unidentified soil organism R6-11" /db_xref="taxon:44781" /clone="11" /clone_lib="R6" rRNA <1..>252 /product="16S ribosomal RNA" primer_bind 234..252 */ ?>

[ Home Page ] [ I/O Scripts Page ]

 


Copyright © 2003 by Sergio Gregorio, Jr.
All rights reserved.