![]() |
BioPHP: PHP for Biocomputing |
![]() |
|
Source Code Listing for seqdb.inc Last updated: April 7, 2003 <?php
require_once("etc.inc");
require_once("seq.inc");
// ================== FUNCTIONS ========================
/*
We begin by describing parse_swissprot() first.
parse_swissprot() parses the Feature Table lines (those that begin with FT) in a Swissprot
data file, extracts the feature key name, from endpoint, to endpoint, and description, and
stores them in a (simple) array.
process_ft() then pushes this array into a larger associative array, called $swiss, which is
also an attribute of the Seq object. It is assigned a key of the form: FT_(feature_key_name).
Examples are: FT_PEPTIDE, FT_DISULFID.
*/
function process_ft(&$swiss, $ft_r)
{
foreach($ft_r as $element)
{
$index = "FT_" . $element[0];
array_shift($element);
if (count($swiss[$index]) == 0)
{
$swiss[$index] = array();
array_push($swiss[$index], $element);
}
else array_push($swiss[$index], $element);
}
}
// at_entrystart() tests if the file pointer is at the start of a new sequence entry.
function at_entrystart($linestr, $dbformat)
{
if ($dbformat == "GENBANK")
return (substr($linestr,0,5) == "LOCUS");
elseif ($dbformat == "SWISSPROT")
return (substr($linestr,0,2) == "ID");
}
// get_entryid() gets the primary accession number of the sequence entry which we are
// currently processing. This uniquely identifies a sequence entry.
function get_entryid(&$flines, $linestr, $dbformat)
{
if ($dbformat == "GENBANK")
return trim(substr($linestr, 12, 16));
elseif ($dbformat == "SWISSPROT")
{
list($lineno, $linestr) = each($flines);
if (substr($linestr,0,2) == "AC")
{
$words = preg_split("/;/", intrim(substr($linestr,5)));
prev($flines);
return $words[0];
}
}
}
// line2r() copies the lines belonging to a single sequence entry into an array.
function line2r($fpseq)
{
$flines = array();
while(1)
{
$linestr = fgets($fpseq, 101);
$flines[] = $linestr;
if (left($linestr,2) == '//') return $flines;
}
return FALSE;
}
// isa_qualifier() tests if the file pointer is at a line containing a feature qualifier.
// This applies only to GenBank sequence files.
function isa_qualifier($str)
{
if (firstchar($str) == '/') return true;
else return false;
}
// fseekline() gets the byte offset (from beginning of file) of a particular line. The file is
// identified by $fp file pointer, while the line is identified by $lineno, which is zero-based.
function fseekline($fp, $lineno)
{
$linectr = 0;
fseek($fp, 0);
while(!feof($fp))
{
$linestr = fgets($fp,101);
if ($linectr == $lineno)
{
fseek($fp, $byteoff);
return $byteoff;
}
$linectr++;
$byteoff = ftell($fp);
}
}
// bsrch_tabfile() searches for a particular sequence id ($seqid) within an *.IDX file
// (identified by $fp file pointer), and returns data located in its $col-th column.
function bsrch_tabfile($fp, $col, $seqid)
{
$linectr = 0;
fseek($fp, 0);
while(!feof($fp))
{
fgets($fp, 41);
$linectr++;
}
$lastline = $linectr;
rewind($fp);
if ($fp == FALSE) die("CANT OPEN FILE");
$searchspace = $lastline;
$floor = 0;
$ceiling = $lastline - 1;
while(1)
{
$offset = ((int) ($searchspace/2));
$lineno = $floor + $offset;
fseekline($fp, $lineno);
$word = preg_split("/\s+/", trim(fgets($fp,81)));
if ($word[$col] == $seqid)
{
$word[] = $lineno;
return $word;
}
elseif ($seqid > $word[$col])
{
$floor = $lineno + 1;
$searchspace = $ceiling - $floor + 1;
if ($searchspace <= 0) return FALSE;
}
else
{
$ceiling = $lineno - 1;
$searchspace = $ceiling - $floor + 1;
if ($searchspace <= 0) return FALSE;
}
}
// fclose($fpidx);
}
// ================== CLASSES ========================
class SeqDB
{ // OPENS definition of SEQDB CLASS.
var $dbname;
var $data_fn;
var $data_fp;
var $dir_fn;
var $dir_fp;
var $seqptr;
var $seqcount;
var $dbformat;
// first() positions the sequence pointer (i.e. the seqptr property of a Seq object) to
// the first sequence in a database (SeqDB object).
function first()
{
$this->seqptr = 0;
}
// last() positions the sequence pointer (i.e. the seqptr property of a Seq object) to
// the last sequence in a database (SeqDB object).
function last()
{
$this->seqptr = $this->seqcount-1;
}
// prev() (short for previous) positions the sequence pointer (i.e. the seqptr property of
// a Seq object) to the sequence that comes before the current sequence.
function prev()
{
if ($this->seqptr > 0) $this->seqptr--;
}
// next() positions the sequence pointer (i.e. the seqptr property of a Seq object) to the
// sequence that comes after the current sequence.
function next()
{
if ($this->seqptr < $this->seqcount-1) $this->seqptr++;
}
// fetch() retrieves all data from the specified sequence record and returns them in the
// form of a Seq object. This method invokes one of several parser methods.
function fetch()
{
if ($this->data_fn == "") die("Cannot invoke fetch() method from a closed object.");
@$seqid = func_get_arg(0);
// IDX and DIR files remain open for the duration of the FETCH() method.
$fp = fopen($this->data_fn, "r");
$fpdir = fopen($this->dir_fn, "r");
if ($seqid != FALSE)
{
$idx_r = bsrch_tabfile($fp, 0, $seqid);
if ($idx_r == FALSE) return FALSE;
else $this->seqptr = $idx_r[3];
}
else
{
// For now, SEQPTR determines CURRENT SEQUENCE ID. Alternative is to track curr line.
fseekline($fp, $this->seqptr);
$idx_r = preg_split("/\s+/", trim(fgets($fp, 81)));
}
$dir_r = bsrch_tabfile($fpdir, 0, $idx_r[1]);
$fpseq = fopen($dir_r[1], "r");
fseekline($fpseq, $idx_r[2]);
$flines = line2r($fpseq);
$myseq = new seq();
if ($this->dbformat == "GENBANK")
$myseq = $this->parse_id($flines);
elseif ($this->dbformat == "SWISSPROT")
$myseq = $this->parse_swissprot($flines);
fclose($fp);
fclose($fpdir);
fclose($fpseq);
return $myseq;
}
// parse_swissprot() parses a Swissprot data file and returns a Seq object containing parsed data.
function parse_swissprot($flines)
{ // OPENS parse_swissprot() function
$accession = array();
$date_r = array();
$desc = "";
$desc_lnctr = 0;
$gename_r = array();
$os_r = array();
$os_linectr = 0;
$os_str = "";
$oc_linectr = 0;
$oc_str = "";
$ref_r = array();
$ra_r = array();
$ra_ctr = 0;
$ra_str = "";
$rl_ctr = 0;
$rl_str = "";
$db_r = array();
$ft_r = array();
$kw_str = "";
$kw_r = array();
while ( list($no, $linestr) = each($flines) )
{ // OPENS 1st (outermost) while ( list($no, $linestr) = each($flines) )
$linelabel = left($linestr, 2);
$linedata = trim(substr($linestr, 5));
$lineend = right($linedata, 1);
if (left($linestr, 2) == "ID")
{ // OPENS if (left($linestr, 2) == "ID")
$words = preg_split("/;/", substr($linestr, 5));
$endc = preg_split("/\s/", $words[0]);
$entry_name = $endc[0];
$namesrc = preg_split("/_/", $entry_name);
$protein_name = $namesrc[0];
$protein_source = $namesrc[1];
$data_class = $endc[1];
$moltype = $words[1];
$length = (int) substr($words[2], 0, strlen($words[2])-4);
} // CLOSES if (left($linestr, 2) == "ID")
if (left($linestr, 2) == "AC")
{ // OPENS if (left($linestr, 2) == "AC")
$accstr = $linedata;
$accstr = substr($accstr, 0, strlen($accstr)-1);
$accline = preg_split("/;/", intrim($accstr));
$accession = array_merge($accession, $accline);
} // CLOSES if (left($linestr, 2) == "AC")
if (left($linestr, 2) == "DT")
{ // OPENS if (left($linestr, 2) == "DT")
// DT DD-MMM-YEAR (REL. XX, COMMENT)
$datestr = $linedata;
$datestr = substr($datestr, 0, strlen($datestr)-1);
$words = preg_split("/\(/", $datestr);
// ( "DD-MMM-YEAR ", "REL. XX, COMMENT")
$firstcomma = strpos($words[1], ",");
$comment = trim(substr($words[1], $firstcomma+1));
// ( "CREATED" => (date, rel), "LAST SEQUENCE UPDATE" => (date, rel),
// "LAST ANNOTATION UPDATE" => (date, rel), COMMENT1 => (date, rel),
// "COMMENT2" => (date, rel), ... )
if ($comment == "CREATED")
{ // OPENS if ($comment == "CREATED")
// this DT line is a DATE CREATED line.
$create_date = substr($words[0], 0, 11);
$create_rel = substr($words[1], 5, ($firstcomma-5));
$date_r[$comment] = array($create_date, $create_rel);
} // CLOSES if ($comment == "CREATED")
elseif ($comment == "LAST SEQUENCE UPDATE")
{ // OPENS elseif ($comment == "LAST SEQUENCE UPDATE")
$sequpd_date = substr($words[0], 0, 11);
$sequpd_rel = substr($words[1], 5, ($firstcomma-5));
$date_r[$comment] = array($sequpd_date, $sequpd_rel);
} // CLOSES elseif ($comment == "LAST SEQUENCE UPDATE")
elseif ($comment == "LAST ANNOTATION UPDATE")
{ // OPENS elseif ($comment == "LAST ANNOTATION UPDATE")
$notupd_date = substr($words[0], 0, 11);
$notupd_rel = substr($words[1], 5, ($firstcomma-5));
$date_r[$comment] = array($notupd_date, $notupd_rel);
} // CLOSES elseif ($comment == "LAST ANNOTATION UPDATE")
else
{ // OPENS else part of if ($comment == "CREATED")
// For now, we do not check vs. duplicate comments.
// We just overwrite the older comment with new one.
$other_comment = $comment;
$other_date = substr($words[0], 0, 11);
$other_rel = substr($words[1], 5, ($firstcomma-5));
$date_r[$comment] = array($other_date, $other_rel);
} // CLOSES else part of if ($comment == "CREATED")
} // CLOSES if (left($linestr, 2) == "DT")
if (left($linestr, 2) == "DE")
{ // OPENS if (left($linestr, 2) == "DE")
$desc_lnctr++;
$linestr = $linedata;
if ($desc_lnctr == 1) $desc .= $linestr;
else $desc .= " " . $linestr;
// Checks if (FRAGMENT) or (FRAGMENTS) is found at the end
// of the DE line to determine if sequence is complete.
if (right($linestr, 1) == ".")
{ // OPENS if (right($linestr, 1) == ".")
if ( (strtoupper(right($linestr, 11)) == "(FRAGMENT).") or
(strtoupper(right($linestr, 12)) == "(FRAGMENTS).") )
$is_fragment = TRUE;
else $is_fragment = FALSE;
} // CLOSE if (right($linestr, 1) == ".")
} // CLOSES if (left($linestr, 2) == "DE")
if ($linelabel == "KW")
{
$kw_str .= $linedata;
if ($lineend == ".")
{
$kw_str = rem_right($kw_str);
$kw_r = preg_split("/;/", $kw_str);
array_walk($kw_r, "trim_element");
$kw_str = "";
}
}
if ($linelabel == "OS")
{ // OPENS if ($linelabel == "OS")
$os_linectr++;
if ($lineend != ".")
{ // we are not yet at the last OS line.
if ($os_linectr == 1) $os_str .= $linedata;
else $os_str .= " $linedata";
}
else
{ // we are at the last OS line.
$os_str .= " $linedata";
$os_str = rem_right($os_str);
$os_line = preg_split("/\, AND /", $os_str);
}
} // CLOSES if ($linelabel == "OS")
if ($linelabel == "OG")
$organelle = rem_right($linedata);
if ($linelabel == "OC")
{
$oc_linectr++;
if ($lineend != ".")
{ // we are not yet at the last OS line.
if ($oc_linectr == 1) $oc_str .= $linedata;
else $oc_str .= " $linedata";
}
else
{ // we are at the last OS line.
$oc_str .= " $linedata";
$oc_str = rem_right($oc_str);
$oc_line = preg_split("/;/", $oc_str);
array_walk($oc_line, "trim_element");
}
}
if ($linelabel == "FT")
{
$ft_key = trim(substr($linestr, 5, 8));
$ft_from = (int) trim(substr($linestr, 14, 6));
$ft_to = (int) trim(substr($linestr, 21, 6));
$ft_desc = rem_right(trim(substr($linestr, 34)));
$ft_r[] = array($ft_key, $ft_from, $ft_to, $ft_desc);
}
// ( rn => ( "rp" => "my rp", "rc" => ("tok1" => "value", ...) ) )
// ( 10 => ( "RP" => "my rp", "RC" => ("PLASMID" => "PLA_VAL", ... ) ) )
// Example: DR AARHUS/GHENT-2DPAGE; 8006; IEF.
if ($linelabel == "DR")
{
// DR DATA_BANK_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER
// We assume that all three data items are mandatory/present in all DR entries.
// ( refno => ( (dbname1, pid1, sid1), (dbname2, pid2, sid2), ... ), 1 => ( ... ) )
// ( 0 => ( (REBASE, pid1, sid1), (WORPEP, pid2, sid2), ... ), 1 => ( ... ) )
$linedata = rem_right($linedata);
$dr_line = preg_split("/;/", $linedata);
array_walk($dr_line, "trim_element");
$db_name = $dr_line[0];
$db_pid = $dr_line[1];
$db_sid = $dr_line[2];
$db_r[] = array($db_name, $db_pid, $db_sid);
}
if ($linelabel == "RN")
{ // OPENS "RN"
// Remove the [ and ] between the reference number.
$refno = substr(rem_right($linedata), 1);
$rc_ctr = 0;
$rc_str = "";
$rc_flag = FALSE;
$inner_r = array();
while ( list($no, $linestr) = each($flines) )
{ // OPENS 2nd WHILE
$linelabel = left($linestr, 2);
$linedata = trim(substr($linestr, 5));
$lineend = right($linedata, 1);
if ($linelabel == "RP") $inner_r["RP"] = $linedata;
elseif ($linelabel == "RC")
{ // OPENS elseif ($linelabel == "RC")
$rc_str .= $linedata;
while ( list($no, $linestr) = each($flines) )
{ // OPENS 3rd WHILE
$linelabel = left($linestr, 2);
$linedata = trim(substr($linestr, 5));
$lineend = right($linedata, 1);
if ($linelabel == "RC")
$rc_str .= " $linedata";
else
{ // opens else
prev($flines);
break;
} // closes else
} // CLOSES 3rd WHILE
// we remove the last character if it is ";"
$rc_str = trim($rc_str);
if (right($rc_str,1) == ";") $rc_str = rem_right($rc_str);
$rc_line = preg_split("/;/", trim($rc_str));
array_walk($rc_line, "trim_element");
$innermost = array();
foreach($rc_line as $tokval_str)
{
// here we assume that there is no whitespace
// before or after (left or right of) the "=".
$tokval_r = preg_split("/=/", $tokval_str);
$token = $tokval_r[0];
$value = $tokval_r[1];
$innermost[$token] = $value;
}
$inner_r["RC"] = $innermost;
} // CLOSES elseif ($linelabel == "RC")
elseif ($linelabel == "RM")
{ // We have no idea what RM is about, so we assume it's a single-line entry.
// which may occur 0 to 1 times inside a SWISSPROT SEQUENCE RECORD.
$inner_r["RM"] = $linedata;
}
elseif ($linelabel == "RX")
{
$linedata = rem_right($linedata);
$rx_line = preg_split("/;/", intrim($linedata));
$inner_r["RX_BDN"] = $rx_line[0];
$inner_r["RX_ID"] = $rx_line[1];
}
elseif ($linelabel == "RA")
{
$ra_ctr++;
if ($ra_ctr == 1) $ra_str = $linedata;
else $ra_str .= " $linedata";
if ($lineend == ";")
{
$ra_str = rem_right($ra_str);
$ra_r = preg_split("/\,/", $ra_str);
array_walk($ra_r, "trim_element");
$inner_r["RA"] = $ra_r;
}
}
elseif ($linelabel == "RL")
{
$rl_ctr++;
if ($rl_ctr == 1) $rl_str = $linedata;
else $rl_str .= " $linedata";
}
else
{
$inner_r["RL"] = $rl_str;
prev($flines);
break;
}
} // CLOSES 2nd WHILE
$ref_r[$refno-1] = $inner_r;
$ra_str = "";
$ra_ctr = 0;
$rl_str = "";
$rl_ctr = 0;
} // CLOSES "RN"
if (left($linestr, 2) == "GN")
{ // OPENS if (left($linestr, 2) == "GN")
// GN is always exactly one line.
// GNAME1 OR GNAME2 ( (GNAME1, GNAME2) )
// GNAME1 AND GNAME2 ( (GNAME1), (GNAME2) )
// GNAME1 AND (GNAME2 OR GNAME3) ( (GNAME1), (GNAME2, GNAME3) )
// GNAME1 OR (GNAME2 AND GNAME3) NOT POSSIBLE!!!
/* ALGORITHM:
1) Split expressions by " AND ".
2) Test each "token" if in between parentheses or not.
3) If not, then token is a singleton, else it's a multiple-ton.
4) Singletons are translated into (GNAME1).
Multiple-tons are translated into (GNAME1, GNAME 2).
5) Push gene name array into larger array. Go to next token.
*/
// Remove "GN " at the beginning of our line.
$linestr = trim(substr($linestr, 5));
// Remove the last character which is always a period.
$linestr = substr($linestr, 0, strlen($linestr)-1);
// Go here if you detect at least one ( or ).
if ( is_false(strpos($linestr, "(")) )
{ // GN Line does not contain any parentheses.
// Ergo, it is made up of all OR's or AND's but not both.
if (strpos($linestr, " OR ") != FALSE)
{
// Case 1: GNAME1 OR GNAME2.
$temp = preg_split("/ OR /", $linestr);
$gename_r[] = $temp;
}
elseif (strpos($linestr, " AND ") != FALSE)
{
// Case 2: GNAME1 AND GNAME2 AND GNAME3.
$temp = preg_split("/ AND /", $linestr);
foreach($temp as $gene)
$gename_r[] = array($gene);
}
else $gename_r[] = array($linestr);
// Case 0: GN GENENAME1. One gene name (no OR, AND).
}
else
{ // OPENS else part of if ( is_false(strpos($linestr, "(")) )
// GN Line contains at least one pair of parentheses.
// Case 3: GNAME1 AND (GNAME2 OR GNAME3) => ( (GNAME1), (GNAME2, GNAME3) )
// COMMENTS # 1 below.
$temp = preg_split("/ AND /", $linestr);
foreach($temp as $gene)
{ // OPENS foreach($temp as $gene)
if (substr($gene, 0, 1) == "(")
{ // a list of 2 or more gene names OR'ed together
// remove the "(" and ")" at both ends of the string.
$gene = substr($gene, 1);
$gene = substr($gene, 0, strlen($gene)-1);
$genelist = preg_split("/ OR /", $gene);
$gename_r[] = $genelist;
}
else
{ // singleton
$gename_r[] = array($gene);
}
} // CLOSES foreach($temp as $gene)
} // CLOSES else part of if ( is_false(strpos($linestr, "(")) )
} // CLOSES if (left($linestr, 2) == "GN")
// 0123456789012345678901234567890123456789
// SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CN;
if ($linelabel == "SQ")
{ // OPENS if ($linelabel == "SQ")
$linedata = rem_right($linedata);
// XXXX AA, XXXX MW, XXXX CN
$words = preg_split("/;/", substr($linedata, 8));
$aa = preg_split("/\s+/", trim($words[0]));
$aa_count = (int) trim($aa[0]);
$mw = preg_split("/\s+/", trim($words[1]));
$mol_wt = (int) trim($mw[0]);
$cn = preg_split("/\s+/", trim($words[2]));
$chk_no = trim($cn[0]);
$chk_method = trim($cn[1]);
$sequence = "";
while ( list($no, $linestr) = each($flines) )
{
$linelabel = left($linestr, 2);
if ($linelabel == "//") break;
$linedata = intrim(trim($linestr));
$sequence .= $linedata;
}
} // CLOSES if ($linelabel == "SQ")
} // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) )
$seqobj = new seq();
$seqobj->id = $protein_name;
$seqobj->seqlength = $length;
$seqobj->moltype = $moltype;
$seqobj->date = $create_date;
$seqobj->accession = $accession[0];
array_shift($accession);
$seqobj->sec_accession = $accession;
$seqobj->source = $os_line;
$seqobj->organism = $oc_line;
$seqobj->sequence = $sequence;
$seqobj->definition = $desc;
$seqobj->keywords = $kw_r;
$genbank_ref_r = array();
$inner_r = array();
foreach($ref_r as $key => $value)
{
$inner_r["REFNO"] = $key;
$db_id = $value["RX_BDN"];
$inner_r[$db_id] = $value["RX_ID"];
$inner_r["REMARKS"] = $value["RP"];
$inner_r["COMMENT"] = $value["RC"];
$inner_r["TITLE"] = $value["RL"];
$inner_r["JOURNAL"] = $value["RL"];
$inner_r["AUTHORS"] = $value["RA"];
$genbank_ref_r[] = $inner_r;
}
$seqobj->reference = $genbank_ref_r;
$swiss = array();
$swiss["ID"] = $protein_name;
$swiss["PROT_NAME"] = $protein_name;
$swiss["MOL_TYPE"] = $moltype;
$swiss["PROT_SOURCE"] = $protein_source;
$swiss["DATA_CLASS"] = $data_class;
$swiss["LENGTH"] = $length;
$swiss["CREATE_DATE"] = $create_date;
$swiss["CREATE_REL"] = $create_rel;
$swiss["SEQUPD_DATE"] = $sequpd_date;
$swiss["SEQUPD_REL"] = $sequpd_rel;
$swiss["NOTUPD_DATE"] = $notupd_date;
$swiss["NOTUPD_REL"] = $notupd_rel;
// ACCESSION is an ARRAY.
$swiss["ACCESSION"] = $accession;
$swiss["PRIM_AC"] = $accession[0];
$swiss["DESC"] = $desc;
$swiss["IS_FRAGMENT"] = $is_fragment;
// KEYWORDS is an ARRAY.
$swiss["KEYWORDS"] = $kw_r;
// ORGANISM is an ARRAY.
$swiss["ORGANISM"] = $os_line;
$swiss["ORGANELLE"] = $organelle;
// FT_
|