![]() |
BioPHP: PHP for Biocomputing |
![]() |
Source Code Listing for seqdb.inc Last updated: April 7, 2003 <?php require_once("etc.inc"); require_once("seq.inc"); // ================== FUNCTIONS ======================== /* We begin by describing parse_swissprot() first. parse_swissprot() parses the Feature Table lines (those that begin with FT) in a Swissprot data file, extracts the feature key name, from endpoint, to endpoint, and description, and stores them in a (simple) array. process_ft() then pushes this array into a larger associative array, called $swiss, which is also an attribute of the Seq object. It is assigned a key of the form: FT_(feature_key_name). Examples are: FT_PEPTIDE, FT_DISULFID. */ function process_ft(&$swiss, $ft_r) { foreach($ft_r as $element) { $index = "FT_" . $element[0]; array_shift($element); if (count($swiss[$index]) == 0) { $swiss[$index] = array(); array_push($swiss[$index], $element); } else array_push($swiss[$index], $element); } } // at_entrystart() tests if the file pointer is at the start of a new sequence entry. function at_entrystart($linestr, $dbformat) { if ($dbformat == "GENBANK") return (substr($linestr,0,5) == "LOCUS"); elseif ($dbformat == "SWISSPROT") return (substr($linestr,0,2) == "ID"); } // get_entryid() gets the primary accession number of the sequence entry which we are // currently processing. This uniquely identifies a sequence entry. function get_entryid(&$flines, $linestr, $dbformat) { if ($dbformat == "GENBANK") return trim(substr($linestr, 12, 16)); elseif ($dbformat == "SWISSPROT") { list($lineno, $linestr) = each($flines); if (substr($linestr,0,2) == "AC") { $words = preg_split("/;/", intrim(substr($linestr,5))); prev($flines); return $words[0]; } } } // line2r() copies the lines belonging to a single sequence entry into an array. function line2r($fpseq) { $flines = array(); while(1) { $linestr = fgets($fpseq, 101); $flines[] = $linestr; if (left($linestr,2) == '//') return $flines; } return FALSE; } // isa_qualifier() tests if the file pointer is at a line containing a feature qualifier. // This applies only to GenBank sequence files. function isa_qualifier($str) { if (firstchar($str) == '/') return true; else return false; } // fseekline() gets the byte offset (from beginning of file) of a particular line. The file is // identified by $fp file pointer, while the line is identified by $lineno, which is zero-based. function fseekline($fp, $lineno) { $linectr = 0; fseek($fp, 0); while(!feof($fp)) { $linestr = fgets($fp,101); if ($linectr == $lineno) { fseek($fp, $byteoff); return $byteoff; } $linectr++; $byteoff = ftell($fp); } } // bsrch_tabfile() searches for a particular sequence id ($seqid) within an *.IDX file // (identified by $fp file pointer), and returns data located in its $col-th column. function bsrch_tabfile($fp, $col, $seqid) { $linectr = 0; fseek($fp, 0); while(!feof($fp)) { fgets($fp, 41); $linectr++; } $lastline = $linectr; rewind($fp); if ($fp == FALSE) die("CANT OPEN FILE"); $searchspace = $lastline; $floor = 0; $ceiling = $lastline - 1; while(1) { $offset = ((int) ($searchspace/2)); $lineno = $floor + $offset; fseekline($fp, $lineno); $word = preg_split("/\s+/", trim(fgets($fp,81))); if ($word[$col] == $seqid) { $word[] = $lineno; return $word; } elseif ($seqid > $word[$col]) { $floor = $lineno + 1; $searchspace = $ceiling - $floor + 1; if ($searchspace <= 0) return FALSE; } else { $ceiling = $lineno - 1; $searchspace = $ceiling - $floor + 1; if ($searchspace <= 0) return FALSE; } } // fclose($fpidx); } // ================== CLASSES ======================== class SeqDB { // OPENS definition of SEQDB CLASS. var $dbname; var $data_fn; var $data_fp; var $dir_fn; var $dir_fp; var $seqptr; var $seqcount; var $dbformat; // first() positions the sequence pointer (i.e. the seqptr property of a Seq object) to // the first sequence in a database (SeqDB object). function first() { $this->seqptr = 0; } // last() positions the sequence pointer (i.e. the seqptr property of a Seq object) to // the last sequence in a database (SeqDB object). function last() { $this->seqptr = $this->seqcount-1; } // prev() (short for previous) positions the sequence pointer (i.e. the seqptr property of // a Seq object) to the sequence that comes before the current sequence. function prev() { if ($this->seqptr > 0) $this->seqptr--; } // next() positions the sequence pointer (i.e. the seqptr property of a Seq object) to the // sequence that comes after the current sequence. function next() { if ($this->seqptr < $this->seqcount-1) $this->seqptr++; } // fetch() retrieves all data from the specified sequence record and returns them in the // form of a Seq object. This method invokes one of several parser methods. function fetch() { if ($this->data_fn == "") die("Cannot invoke fetch() method from a closed object."); @$seqid = func_get_arg(0); // IDX and DIR files remain open for the duration of the FETCH() method. $fp = fopen($this->data_fn, "r"); $fpdir = fopen($this->dir_fn, "r"); if ($seqid != FALSE) { $idx_r = bsrch_tabfile($fp, 0, $seqid); if ($idx_r == FALSE) return FALSE; else $this->seqptr = $idx_r[3]; } else { // For now, SEQPTR determines CURRENT SEQUENCE ID. Alternative is to track curr line. fseekline($fp, $this->seqptr); $idx_r = preg_split("/\s+/", trim(fgets($fp, 81))); } $dir_r = bsrch_tabfile($fpdir, 0, $idx_r[1]); $fpseq = fopen($dir_r[1], "r"); fseekline($fpseq, $idx_r[2]); $flines = line2r($fpseq); $myseq = new seq(); if ($this->dbformat == "GENBANK") $myseq = $this->parse_id($flines); elseif ($this->dbformat == "SWISSPROT") $myseq = $this->parse_swissprot($flines); fclose($fp); fclose($fpdir); fclose($fpseq); return $myseq; } // parse_swissprot() parses a Swissprot data file and returns a Seq object containing parsed data. function parse_swissprot($flines) { // OPENS parse_swissprot() function $accession = array(); $date_r = array(); $desc = ""; $desc_lnctr = 0; $gename_r = array(); $os_r = array(); $os_linectr = 0; $os_str = ""; $oc_linectr = 0; $oc_str = ""; $ref_r = array(); $ra_r = array(); $ra_ctr = 0; $ra_str = ""; $rl_ctr = 0; $rl_str = ""; $db_r = array(); $ft_r = array(); $kw_str = ""; $kw_r = array(); while ( list($no, $linestr) = each($flines) ) { // OPENS 1st (outermost) while ( list($no, $linestr) = each($flines) ) $linelabel = left($linestr, 2); $linedata = trim(substr($linestr, 5)); $lineend = right($linedata, 1); if (left($linestr, 2) == "ID") { // OPENS if (left($linestr, 2) == "ID") $words = preg_split("/;/", substr($linestr, 5)); $endc = preg_split("/\s/", $words[0]); $entry_name = $endc[0]; $namesrc = preg_split("/_/", $entry_name); $protein_name = $namesrc[0]; $protein_source = $namesrc[1]; $data_class = $endc[1]; $moltype = $words[1]; $length = (int) substr($words[2], 0, strlen($words[2])-4); } // CLOSES if (left($linestr, 2) == "ID") if (left($linestr, 2) == "AC") { // OPENS if (left($linestr, 2) == "AC") $accstr = $linedata; $accstr = substr($accstr, 0, strlen($accstr)-1); $accline = preg_split("/;/", intrim($accstr)); $accession = array_merge($accession, $accline); } // CLOSES if (left($linestr, 2) == "AC") if (left($linestr, 2) == "DT") { // OPENS if (left($linestr, 2) == "DT") // DT DD-MMM-YEAR (REL. XX, COMMENT) $datestr = $linedata; $datestr = substr($datestr, 0, strlen($datestr)-1); $words = preg_split("/\(/", $datestr); // ( "DD-MMM-YEAR ", "REL. XX, COMMENT") $firstcomma = strpos($words[1], ","); $comment = trim(substr($words[1], $firstcomma+1)); // ( "CREATED" => (date, rel), "LAST SEQUENCE UPDATE" => (date, rel), // "LAST ANNOTATION UPDATE" => (date, rel), COMMENT1 => (date, rel), // "COMMENT2" => (date, rel), ... ) if ($comment == "CREATED") { // OPENS if ($comment == "CREATED") // this DT line is a DATE CREATED line. $create_date = substr($words[0], 0, 11); $create_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($create_date, $create_rel); } // CLOSES if ($comment == "CREATED") elseif ($comment == "LAST SEQUENCE UPDATE") { // OPENS elseif ($comment == "LAST SEQUENCE UPDATE") $sequpd_date = substr($words[0], 0, 11); $sequpd_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($sequpd_date, $sequpd_rel); } // CLOSES elseif ($comment == "LAST SEQUENCE UPDATE") elseif ($comment == "LAST ANNOTATION UPDATE") { // OPENS elseif ($comment == "LAST ANNOTATION UPDATE") $notupd_date = substr($words[0], 0, 11); $notupd_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($notupd_date, $notupd_rel); } // CLOSES elseif ($comment == "LAST ANNOTATION UPDATE") else { // OPENS else part of if ($comment == "CREATED") // For now, we do not check vs. duplicate comments. // We just overwrite the older comment with new one. $other_comment = $comment; $other_date = substr($words[0], 0, 11); $other_rel = substr($words[1], 5, ($firstcomma-5)); $date_r[$comment] = array($other_date, $other_rel); } // CLOSES else part of if ($comment == "CREATED") } // CLOSES if (left($linestr, 2) == "DT") if (left($linestr, 2) == "DE") { // OPENS if (left($linestr, 2) == "DE") $desc_lnctr++; $linestr = $linedata; if ($desc_lnctr == 1) $desc .= $linestr; else $desc .= " " . $linestr; // Checks if (FRAGMENT) or (FRAGMENTS) is found at the end // of the DE line to determine if sequence is complete. if (right($linestr, 1) == ".") { // OPENS if (right($linestr, 1) == ".") if ( (strtoupper(right($linestr, 11)) == "(FRAGMENT).") or (strtoupper(right($linestr, 12)) == "(FRAGMENTS).") ) $is_fragment = TRUE; else $is_fragment = FALSE; } // CLOSE if (right($linestr, 1) == ".") } // CLOSES if (left($linestr, 2) == "DE") if ($linelabel == "KW") { $kw_str .= $linedata; if ($lineend == ".") { $kw_str = rem_right($kw_str); $kw_r = preg_split("/;/", $kw_str); array_walk($kw_r, "trim_element"); $kw_str = ""; } } if ($linelabel == "OS") { // OPENS if ($linelabel == "OS") $os_linectr++; if ($lineend != ".") { // we are not yet at the last OS line. if ($os_linectr == 1) $os_str .= $linedata; else $os_str .= " $linedata"; } else { // we are at the last OS line. $os_str .= " $linedata"; $os_str = rem_right($os_str); $os_line = preg_split("/\, AND /", $os_str); } } // CLOSES if ($linelabel == "OS") if ($linelabel == "OG") $organelle = rem_right($linedata); if ($linelabel == "OC") { $oc_linectr++; if ($lineend != ".") { // we are not yet at the last OS line. if ($oc_linectr == 1) $oc_str .= $linedata; else $oc_str .= " $linedata"; } else { // we are at the last OS line. $oc_str .= " $linedata"; $oc_str = rem_right($oc_str); $oc_line = preg_split("/;/", $oc_str); array_walk($oc_line, "trim_element"); } } if ($linelabel == "FT") { $ft_key = trim(substr($linestr, 5, 8)); $ft_from = (int) trim(substr($linestr, 14, 6)); $ft_to = (int) trim(substr($linestr, 21, 6)); $ft_desc = rem_right(trim(substr($linestr, 34))); $ft_r[] = array($ft_key, $ft_from, $ft_to, $ft_desc); } // ( rn => ( "rp" => "my rp", "rc" => ("tok1" => "value", ...) ) ) // ( 10 => ( "RP" => "my rp", "RC" => ("PLASMID" => "PLA_VAL", ... ) ) ) // Example: DR AARHUS/GHENT-2DPAGE; 8006; IEF. if ($linelabel == "DR") { // DR DATA_BANK_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER // We assume that all three data items are mandatory/present in all DR entries. // ( refno => ( (dbname1, pid1, sid1), (dbname2, pid2, sid2), ... ), 1 => ( ... ) ) // ( 0 => ( (REBASE, pid1, sid1), (WORPEP, pid2, sid2), ... ), 1 => ( ... ) ) $linedata = rem_right($linedata); $dr_line = preg_split("/;/", $linedata); array_walk($dr_line, "trim_element"); $db_name = $dr_line[0]; $db_pid = $dr_line[1]; $db_sid = $dr_line[2]; $db_r[] = array($db_name, $db_pid, $db_sid); } if ($linelabel == "RN") { // OPENS "RN" // Remove the [ and ] between the reference number. $refno = substr(rem_right($linedata), 1); $rc_ctr = 0; $rc_str = ""; $rc_flag = FALSE; $inner_r = array(); while ( list($no, $linestr) = each($flines) ) { // OPENS 2nd WHILE $linelabel = left($linestr, 2); $linedata = trim(substr($linestr, 5)); $lineend = right($linedata, 1); if ($linelabel == "RP") $inner_r["RP"] = $linedata; elseif ($linelabel == "RC") { // OPENS elseif ($linelabel == "RC") $rc_str .= $linedata; while ( list($no, $linestr) = each($flines) ) { // OPENS 3rd WHILE $linelabel = left($linestr, 2); $linedata = trim(substr($linestr, 5)); $lineend = right($linedata, 1); if ($linelabel == "RC") $rc_str .= " $linedata"; else { // opens else prev($flines); break; } // closes else } // CLOSES 3rd WHILE // we remove the last character if it is ";" $rc_str = trim($rc_str); if (right($rc_str,1) == ";") $rc_str = rem_right($rc_str); $rc_line = preg_split("/;/", trim($rc_str)); array_walk($rc_line, "trim_element"); $innermost = array(); foreach($rc_line as $tokval_str) { // here we assume that there is no whitespace // before or after (left or right of) the "=". $tokval_r = preg_split("/=/", $tokval_str); $token = $tokval_r[0]; $value = $tokval_r[1]; $innermost[$token] = $value; } $inner_r["RC"] = $innermost; } // CLOSES elseif ($linelabel == "RC") elseif ($linelabel == "RM") { // We have no idea what RM is about, so we assume it's a single-line entry. // which may occur 0 to 1 times inside a SWISSPROT SEQUENCE RECORD. $inner_r["RM"] = $linedata; } elseif ($linelabel == "RX") { $linedata = rem_right($linedata); $rx_line = preg_split("/;/", intrim($linedata)); $inner_r["RX_BDN"] = $rx_line[0]; $inner_r["RX_ID"] = $rx_line[1]; } elseif ($linelabel == "RA") { $ra_ctr++; if ($ra_ctr == 1) $ra_str = $linedata; else $ra_str .= " $linedata"; if ($lineend == ";") { $ra_str = rem_right($ra_str); $ra_r = preg_split("/\,/", $ra_str); array_walk($ra_r, "trim_element"); $inner_r["RA"] = $ra_r; } } elseif ($linelabel == "RL") { $rl_ctr++; if ($rl_ctr == 1) $rl_str = $linedata; else $rl_str .= " $linedata"; } else { $inner_r["RL"] = $rl_str; prev($flines); break; } } // CLOSES 2nd WHILE $ref_r[$refno-1] = $inner_r; $ra_str = ""; $ra_ctr = 0; $rl_str = ""; $rl_ctr = 0; } // CLOSES "RN" if (left($linestr, 2) == "GN") { // OPENS if (left($linestr, 2) == "GN") // GN is always exactly one line. // GNAME1 OR GNAME2 ( (GNAME1, GNAME2) ) // GNAME1 AND GNAME2 ( (GNAME1), (GNAME2) ) // GNAME1 AND (GNAME2 OR GNAME3) ( (GNAME1), (GNAME2, GNAME3) ) // GNAME1 OR (GNAME2 AND GNAME3) NOT POSSIBLE!!! /* ALGORITHM: 1) Split expressions by " AND ". 2) Test each "token" if in between parentheses or not. 3) If not, then token is a singleton, else it's a multiple-ton. 4) Singletons are translated into (GNAME1). Multiple-tons are translated into (GNAME1, GNAME 2). 5) Push gene name array into larger array. Go to next token. */ // Remove "GN " at the beginning of our line. $linestr = trim(substr($linestr, 5)); // Remove the last character which is always a period. $linestr = substr($linestr, 0, strlen($linestr)-1); // Go here if you detect at least one ( or ). if ( is_false(strpos($linestr, "(")) ) { // GN Line does not contain any parentheses. // Ergo, it is made up of all OR's or AND's but not both. if (strpos($linestr, " OR ") != FALSE) { // Case 1: GNAME1 OR GNAME2. $temp = preg_split("/ OR /", $linestr); $gename_r[] = $temp; } elseif (strpos($linestr, " AND ") != FALSE) { // Case 2: GNAME1 AND GNAME2 AND GNAME3. $temp = preg_split("/ AND /", $linestr); foreach($temp as $gene) $gename_r[] = array($gene); } else $gename_r[] = array($linestr); // Case 0: GN GENENAME1. One gene name (no OR, AND). } else { // OPENS else part of if ( is_false(strpos($linestr, "(")) ) // GN Line contains at least one pair of parentheses. // Case 3: GNAME1 AND (GNAME2 OR GNAME3) => ( (GNAME1), (GNAME2, GNAME3) ) // COMMENTS # 1 below. $temp = preg_split("/ AND /", $linestr); foreach($temp as $gene) { // OPENS foreach($temp as $gene) if (substr($gene, 0, 1) == "(") { // a list of 2 or more gene names OR'ed together // remove the "(" and ")" at both ends of the string. $gene = substr($gene, 1); $gene = substr($gene, 0, strlen($gene)-1); $genelist = preg_split("/ OR /", $gene); $gename_r[] = $genelist; } else { // singleton $gename_r[] = array($gene); } } // CLOSES foreach($temp as $gene) } // CLOSES else part of if ( is_false(strpos($linestr, "(")) ) } // CLOSES if (left($linestr, 2) == "GN") // 0123456789012345678901234567890123456789 // SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CN; if ($linelabel == "SQ") { // OPENS if ($linelabel == "SQ") $linedata = rem_right($linedata); // XXXX AA, XXXX MW, XXXX CN $words = preg_split("/;/", substr($linedata, 8)); $aa = preg_split("/\s+/", trim($words[0])); $aa_count = (int) trim($aa[0]); $mw = preg_split("/\s+/", trim($words[1])); $mol_wt = (int) trim($mw[0]); $cn = preg_split("/\s+/", trim($words[2])); $chk_no = trim($cn[0]); $chk_method = trim($cn[1]); $sequence = ""; while ( list($no, $linestr) = each($flines) ) { $linelabel = left($linestr, 2); if ($linelabel == "//") break; $linedata = intrim(trim($linestr)); $sequence .= $linedata; } } // CLOSES if ($linelabel == "SQ") } // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) ) $seqobj = new seq(); $seqobj->id = $protein_name; $seqobj->seqlength = $length; $seqobj->moltype = $moltype; $seqobj->date = $create_date; $seqobj->accession = $accession[0]; array_shift($accession); $seqobj->sec_accession = $accession; $seqobj->source = $os_line; $seqobj->organism = $oc_line; $seqobj->sequence = $sequence; $seqobj->definition = $desc; $seqobj->keywords = $kw_r; $genbank_ref_r = array(); $inner_r = array(); foreach($ref_r as $key => $value) { $inner_r["REFNO"] = $key; $db_id = $value["RX_BDN"]; $inner_r[$db_id] = $value["RX_ID"]; $inner_r["REMARKS"] = $value["RP"]; $inner_r["COMMENT"] = $value["RC"]; $inner_r["TITLE"] = $value["RL"]; $inner_r["JOURNAL"] = $value["RL"]; $inner_r["AUTHORS"] = $value["RA"]; $genbank_ref_r[] = $inner_r; } $seqobj->reference = $genbank_ref_r; $swiss = array(); $swiss["ID"] = $protein_name; $swiss["PROT_NAME"] = $protein_name; $swiss["MOL_TYPE"] = $moltype; $swiss["PROT_SOURCE"] = $protein_source; $swiss["DATA_CLASS"] = $data_class; $swiss["LENGTH"] = $length; $swiss["CREATE_DATE"] = $create_date; $swiss["CREATE_REL"] = $create_rel; $swiss["SEQUPD_DATE"] = $sequpd_date; $swiss["SEQUPD_REL"] = $sequpd_rel; $swiss["NOTUPD_DATE"] = $notupd_date; $swiss["NOTUPD_REL"] = $notupd_rel; // ACCESSION is an ARRAY. $swiss["ACCESSION"] = $accession; $swiss["PRIM_AC"] = $accession[0]; $swiss["DESC"] = $desc; $swiss["IS_FRAGMENT"] = $is_fragment; // KEYWORDS is an ARRAY. $swiss["KEYWORDS"] = $kw_r; // ORGANISM is an ARRAY. $swiss["ORGANISM"] = $os_line; $swiss["ORGANELLE"] = $organelle; // FT_ |