BioPHP: PHP for Biocomputing


[ Home Page ] [ I/O Scripts Page ]

Source Code Listing of Still Other *.inc.php Files
(Parser for Still Other file formats)

Note: This is part of the BioPHP 1.1 alpha code set. The code, which is more than 1,500 lines long, is still rough. It also depends on
two files, the alpha versions of "seqdb.inc.php" and "etc.inc.php",
which I will post shortly. Improvements to the code are welcome!
===================================================== PMD.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seqdb.inc.php");
require_once("seq.inc.php");

/**
 * Data holder for the fields of one PMD entry.
 *
 * parse_protein_pmd() currently fills only the entry, article, authors,
 * MEDLINE, journal and title properties; the others are not parsed yet.
 */
class Protein_PMD
{
    var $entry_type;
    var $entry_no;
    var $mutation_type;
    var $article_no;
    var $authors;
    var $journal;
    var $medline_no;
    var $title;
    var $dbref;         // structure unclear, implement later.
    var $protein;
    var $sequence;
    var $source;
    var $n_terminal;
    var $express_sys;
    var $change;        // structure unclear, implement later.
    var $disease;       // structure unclear, implement later.
    var $comment;
}

/**
 * Parses the lines of one PMD record into a Protein_PMD object.
 *
 * Every line is split at column 16 into a label and a data part. A line with
 * an empty label continues the multi-line field (AUTHORS, JOURNAL, TITLE)
 * that is currently open; the next labelled line closes it. Parsing stops at
 * the "///" line.
 *
 * left() and trim_element() are not defined in this file; they are presumably
 * provided by etc.inc.php - TODO confirm.
 *
 * @param array $flines Lines of the record, one string per line.
 * @return Protein_PMD Fields missing from the record are NULL
 *                     (authors: an empty array).
 */
function parse_protein_pmd($flines)
{
    // Defaults, so a record without a given field no longer reads an
    // undefined variable when the result object is filled.
    $entry_type = NULL;
    $entry_no = NULL;
    $mutation_type = NULL;
    $article_no = NULL;
    $medline_no = NULL;
    $journal = NULL;
    $title = NULL;
    $aAuthors = array();

    // State of the multi-line fields.
    $auth_flag = FALSE;
    $auth_string = "";
    $jour_flag = FALSE;
    $jour_string = "";
    $title_flag = FALSE;
    $title_string = "";

    // Sentinel terminator: a record that lacks its "///" line (or ends in a
    // multi-line field) still gets its pending field flushed.
    $flines[] = "///";

    foreach ($flines as $linestr) {
        $linelabel = trim(left($linestr, 16));
        $linedata = trim(substr($linestr, 16));
        $has_label = (strlen($linelabel) > 0);

        /* ENTRY data field.
           Example: ENTRY A000300 - Artificial 2607383
           Assume that ENTRY data field is always one line.
           Assume that ENTRY_TYPE and ENTRY_NO can be found at fixed positions in the line.
           The trailing items (mutation type, article number) are read only when present. */
        if ($linelabel == "ENTRY") {
            $entry_type = substr($linedata, 0, 1);
            $entry_no = substr($linedata, 1, 6);
            $entry_tokens = preg_split("/\s+/", (string) substr($linedata, 10), -1, PREG_SPLIT_NO_EMPTY);
            if (isset($entry_tokens[0])) $mutation_type = trim($entry_tokens[0]);
            if (isset($entry_tokens[1])) $article_no = trim($entry_tokens[1]);
        }

        /* AUTHORS data field
           Example: AUTHORS Shoshani I., Bianchi G., Desaubry L., Dessauer C.W. & Johnson R.A. */
        if ($linelabel == "AUTHORS") {
            $auth_string = $linedata . " ";
            $auth_flag = TRUE;
        } elseif ($auth_flag) {
            if ($has_label) {
                // Next field reached: split the collected text on "," and "&".
                $aAuthors = preg_split("/[\,\&]/", $auth_string, -1, PREG_SPLIT_NO_EMPTY);
                array_walk($aAuthors, "trim_element");
                $auth_string = "";
                $auth_flag = FALSE;
            } else {
                $auth_string .= $linedata . " ";
            }
        }

        /* MEDLINE data field
           Example: MEDLINE 10666322
           For now, assume that it's always exactly one entry (word). */
        if ($linelabel == "MEDLINE") $medline_no = $linedata;

        /* JOURNAL data field
           Example: JOURNAL Arch.Biochem.Biophys. (2000) 374(2), 389-394
           For now, let's just concatenate all the lines with space. We don't extract
           individual data items like journal title, publication year, etc. */
        if ($linelabel == "JOURNAL") {
            $jour_string = $linedata . " ";
            $jour_flag = TRUE;
        } elseif ($jour_flag) {
            if ($has_label) {
                $journal = trim($jour_string);
                $jour_string = "";
                $jour_flag = FALSE;
            } else {
                $jour_string .= $linedata . " ";
            }
        }

        /* TITLE data field - handle the same way as JOURNAL.
           Example: TITLE Lys-Ala mutations of type I adenylyl cyclase result in altered
           susceptibility to inhibition by adenine nucleoside 3'-polyphosphates. */
        if ($linelabel == "TITLE") {
            $title_string = $linedata . " ";
            $title_flag = TRUE;
        } elseif ($title_flag) {
            if ($has_label) {
                $title = trim($title_string);
                $title_string = "";
                $title_flag = FALSE;
            } else {
                $title_string .= $linedata . " ";
            }
        }

        if ($linelabel == "///") break;
    }

    $oProtein = new Protein_PMD();
    $oProtein->entry_type = $entry_type;
    $oProtein->entry_no = $entry_no;
    $oProtein->mutation_type = $mutation_type;
    $oProtein->article_no = $article_no;
    $oProtein->authors = $aAuthors;
    $oProtein->medline_no = $medline_no;
    $oProtein->journal = $journal;
    $oProtein->title = $title;
    return $oProtein;
}
?>
===================================================== PRF.INC.PHP - SOURCE CODE
<?php
// prf.inc.php
require_once("etc.inc.php");
require_once("seqdb.inc.php");
require_once("seq.inc.php");

/**
 * Data holder for the fields of one PRF entry.
 *
 * parse_protein_prf() currently fills entry_code, entry_name, journal,
 * authors, title and comment; the others are not parsed yet.
 */
class Protein_PRF
{
    var $entry_code;
    var $entry_name;
    var $source;
    var $journal;
    var $authors;
    var $title;
    var $keywords;
    var $comment;
    var $dbref;
    var $sequence;
}

/**
 * Parses the lines of one PRF record into a Protein_PRF object.
 *
 * Every line is split at column 12 into a label and a data part. A line with
 * an empty label continues the multi-line field (JOURNAL, AUTHOR, TITLE,
 * COMMENT) that is currently open; the next labelled line closes it. The
 * SOURCE, KEYWORD, CROSSREF and SEQUENCE fields are recognised by the format
 * but not extracted yet. Parsing stops at the "///" line.
 *
 * left() and trim_element() are not defined in this file; they are presumably
 * provided by etc.inc.php - TODO confirm.
 *
 * @param array $flines Lines of the record, one string per line.
 * @return Protein_PRF Fields missing from the record are NULL
 *                     (authors: an empty array).
 */
function parse_protein_prf($flines)
{
    // Defaults for fields that may be missing from the record.
    $entry_code = NULL;
    $entry_name = NULL;
    $journal = NULL;
    $title = NULL;
    $comment = NULL;
    $aAuthors = array();

    // State of the multi-line fields.
    $auth_flag = FALSE;
    $auth_string = "";
    $jour_flag = FALSE;
    $jour_string = "";
    $title_flag = FALSE;
    $title_string = "";
    $comm_flag = FALSE;
    $comm_string = "";

    // Sentinel terminator: guarantees that a pending multi-line field is
    // flushed even when the record has no "///" line.
    $flines[] = "///";

    foreach ($flines as $linestr) {
        $linelabel = trim(left($linestr, 12));
        $linedata = trim(substr($linestr, 12));
        $has_label = (strlen($linelabel) > 0);

        /* (ENTRY) CODE data field - one entry (word) in one line, the entry code
           is 6-7 digits followed by 1-2 alpha letters.
           Example: CODE 0904306A */
        if ($linelabel == "CODE") $entry_code = $linedata;

        /* (ENTRY) NAME data field - for now we only support the SUBUNIT and ISOTYPE
           subkeys/qualifiers, and not the "determine" subkey/qualifier which appears
           in the example below.
           Example: NAME interleukin 2 determine protein */
        if ($linelabel == "NAME") $entry_name = $linedata;

        /* SOURCE data field - skipped for now (nothing is extracted).
           Example: SOURCE Homo sapiens cname man taxon
           Eucarya;Animalia;Metazoa;Chordata;Vertebrata;Gnathostomata;
           Mammalia;Eutheria;Primates;Catarrhini;Hominidae */

        /* JOURNAL data field
           Example: JOURNAL Nature(London), 302(5906),305-310(1983)
           For now, let's just concatenate all the lines with space. We don't extract
           individual data items like journal title, publication year, etc. */
        if ($linelabel == "JOURNAL") {
            $jour_string = $linedata . " ";
            $jour_flag = TRUE;
        } elseif ($jour_flag) {
            if ($has_label) {
                $journal = trim($jour_string);
                $jour_string = "";
                $jour_flag = FALSE;
            } else {
                $jour_string .= $linedata . " ";
            }
        }

        /* AUTHORS data field
           Example: AUTHOR Taniguchi,T., Matsui,H., Fujita,T., Takaoka,C., Kashima,N.,
           Yoshimoto,R., Hamuro,J. */
        if ($linelabel == "AUTHOR") {
            $auth_string = $linedata . " ";
            $auth_flag = TRUE;
        } elseif ($auth_flag) {
            if ($has_label) {
                // Names are separated by ".,"; the split eats the "." of every
                // name except the last, so it is put back on those.
                $temp = preg_split("/\.\,/", $auth_string, -1, PREG_SPLIT_NO_EMPTY);
                array_walk($temp, "trim_element");
                $last_author = array_pop($temp);
                foreach ($temp as $author) $aAuthors[] = "$author.";
                // array_pop() yields NULL for an empty AUTHOR field; skip it.
                if ($last_author !== NULL) $aAuthors[] = $last_author;
                $auth_string = "";
                $auth_flag = FALSE;
            } else {
                $auth_string .= $linedata . " ";
            }
        }

        /* TITLE data field - multiline; handle the same way as JOURNAL.
           Example: TITLE Structure and expression of a cloned cDNA for human interleukin-2. */
        if ($linelabel == "TITLE") {
            $title_string = $linedata . " ";
            $title_flag = TRUE;
        } elseif ($title_flag) {
            if ($has_label) {
                $title = trim($title_string);
                $title_string = "";
                $title_flag = FALSE;
            } else {
                $title_string .= $linedata . " ";
            }
        }

        /* KEYWORD data field - not extracted yet.
           Example: KEYWORD Interleukin 2 Human Cloning From cDNA Library Seq Determination
           812bp mRNA Hybridization Translation Expression in Monkey Cell 153AAs
           T Cell Growth Factor Stimulation of Thymidine Uptake */

        /* COMMENT data field
           Example: COMMENT CHO.x2 hetero.x3 */
        if ($linelabel == "COMMENT") {
            $comm_string = $linedata . " ";
            $comm_flag = TRUE;
        } elseif ($comm_flag) {
            if ($has_label) {
                $comment = trim($comm_string);
                $comm_string = "";
                $comm_flag = FALSE;
            } else {
                $comm_string .= $linedata . " ";
            }
        }

        /* CROSSREF data field - not extracted yet.
           Example: CROSSREF PIR=ICHU2;PIR=ICGI2 */

        /* SEQUENCE data field - not extracted yet.
           Example: SEQUENCE MYRMQLLSCI ALSLALVTNS APTSSSTKKT QLQLEHLLLD LQMILNGINN
           YKNPKLTRML TFKFYMPKKA TELKHLQCLE EELKPLEEVL NLAQSKNFHL RPRDLISNIN
           VIVLELKGSE TTFMCEYADE TATIVEFLNR WITFCQSIIS TLT */

        if ($linelabel == "///") break;
    }

    $oProtein = new Protein_PRF();
    $oProtein->entry_code = $entry_code;
    $oProtein->entry_name = $entry_name;
    $oProtein->journal = $journal;
    $oProtein->authors = $aAuthors;
    $oProtein->title = $title;
    // $oProtein->keywords = $aKeywords;
    $oProtein->comment = $comment;
    // $oProtein->dbref = $dbref;
    // $oProtein->sequence = $sequence;
    return $oProtein;
}

/*
CODE 0904306A
NAME interleukin 2 determine protein
SOURCE Homo sapiens cname man taxon Eucarya;Animalia;Metazoa;Chordata;Vertebrata;Gnathostomata; Mammalia;Eutheria;Primates;Catarrhini;Hominidae
JOURNAL Nature(London), 302(5906),305-310(1983)
AUTHOR Taniguchi,T., Matsui,H., Fujita,T., Takaoka,C., Kashima,N., Yoshimoto,R., Hamuro,J.
TITLE Structure and expression of a cloned cDNA for human interleukin-2.
KEYWORD Interleukin 2 Human Cloning From cDNA Library Seq Determination 812bp mRNA Hybridization Translation Expression in Monkey Cell 153AAs T Cell Growth Factor Stimulation of Thymidine Uptake
CROSSREF PIR=ICHU2;PIR=ICGI2
SEQUENCE MYRMQLLSCI ALSLALVTNS APTSSSTKKT QLQLEHLLLD LQMILNGINN YKNPKLTRML TFKFYMPKKA TELKHLQCLE EELKPLEEVL NLAQSKNFHL RPRDLISNIN VIVLELKGSE TTFMCEYADE TATIVEFLNR WITFCQSIIS TLT
*/
?>
===================================================== PRINTS.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");
require_once("seqdb.inc.php");

/**
 * Data holder for the fields of one PRINTS motif entry.
 */
class PrintsMotif
{
    var $entry_name;
    var $entry_type;
    var $create_date;
    var $upd_date;
    var $desc;
}

/**
 * Parses the lines of one PRINTS entry into a PrintsMotif object.
 *
 * The first three characters of a line are its label ("gc;", "gn;", "ga;",
 * "gd;"); the data part starts at column 4.
 *
 * left() is not defined in this file; it is presumably provided by
 * etc.inc.php - TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return PrintsMotif Fields missing from the entry are NULL.
 */
function parse_motif_prints($flines)
{
    // Defaults for fields that may be missing from the entry.
    $entry_name = NULL;
    $entry_type = NULL;
    $create_date = NULL;
    $upd_date = NULL;
    $desc = NULL;
    $aEntry = array();

    // State of the multi-line description.
    $desc_flag = FALSE;
    $desc_string = "";

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 3);
        $linedata = trim(substr($linestr, 4));

        // GC data field - seems to contain the entry name (one word?) in exactly one line.
        if ($linelabel == "gc;") $entry_name = $linedata;

        // GN data field - seems to contain the entry type (> 1 word) in exactly one line.
        if ($linelabel == "gn;") $entry_type = $linedata;

        // GA data field - DATE CREATED and UPDATED. Assume exactly one line.
        // Example: ga; 16-NOV-1995; UPDATE 06-JUN-1999
        if ($linelabel == "ga;") {
            $date_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            // The first token is the creation date; the rest are "KEY value ..." pairs.
            if (count($date_tokens) > 0) $create_date = array_shift($date_tokens);
            foreach ($date_tokens as $keyval) {
                $keyval_tokens = preg_split("/\s+/", $keyval, -1, PREG_SPLIT_NO_EMPTY);
                if (count($keyval_tokens) == 0) continue;   // whitespace-only segment
                // The first word is the key name; the remaining words, joined
                // with a whitespace character, are its value.
                $key = array_shift($keyval_tokens);
                $aEntry[$key] = implode(" ", $keyval_tokens);
            }
            // Entries without an UPDATE item keep the default.
            if (isset($aEntry["UPDATE"])) $upd_date = $aEntry["UPDATE"];
        }

        // GD data field - DESCRIPTION entry - mostly multiline, connect with whitespace.
        if ($linelabel == "gd;") {
            $desc_string .= $linedata . " ";
            $desc_flag = TRUE;
        } elseif ($desc_flag) {
            $desc = trim($desc_string);
            $desc_flag = FALSE;
            $desc_string = "";
        }
    }

    // The description was only closed by a following non-"gd;" line; flush it
    // here when the description lines are the last lines of the input.
    if ($desc_flag) $desc = trim($desc_string);

    $oPrintsMotif = new PrintsMotif();
    $oPrintsMotif->entry_name = $entry_name;
    $oPrintsMotif->entry_type = $entry_type;
    $oPrintsMotif->desc = $desc;
    $oPrintsMotif->create_date = $create_date;
    $oPrintsMotif->upd_date = $upd_date;
    return $oPrintsMotif;
}
?>
===================================================== PRODOM.INC.PHP - SOURCE CODE
<?php
// prodom.inc.php

/**
 * Data holder for the fields of one ProDom protein-family entry.
 */
class ProtFam_Prodom
{
    var $entry_no;
    var $accession;
    var $release;
    var $domain_count;
    var $freq_names;    // an associative array e.g. array("FDAM" => 2, ...)
    var $keywords;      // a simple 1D array e.g. array("DNA-BINDING", "PROTEASE", ...)
}

/**
 * Parses the lines of one ProDom entry into a ProtFam_Prodom object.
 *
 * The first two characters of a line are its label. Only the ID, AC and KW
 * fields are extracted; parsing stops at the "//" line.
 *
 * left() is not defined in this file; it is presumably provided by
 * etc.inc.php - TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return ProtFam_Prodom Scalar fields missing from the entry are NULL;
 *                        freq_names and keywords default to empty arrays.
 */
function parse_protfam_prodom($flines)
{
    // Defaults for fields that may be missing from the entry.
    $entry_no = NULL;
    $accession = NULL;
    $release = NULL;
    $domain_count = NULL;
    $aFreqNames = array();
    $aKeywords = array();

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 5));

        /* ID - IDENTIFIER data field - contains the ENTRY_NO, RELEASE, and DOMAIN_COUNT
           data items. Items that are absent keep their default.
           Example: ID 20167 p2002.1 10 seq. */
        if ($linelabel == "ID") {
            // The data of the ID line starts at position index 3 instead of 5.
            $id_tokens = preg_split("/\s+/", trim(substr($linestr, 3)), -1, PREG_SPLIT_NO_EMPTY);
            // "20167", "p2002.1", "10", "seq."
            if (isset($id_tokens[0])) $entry_no = trim($id_tokens[0]);
            // we remove the prefix "p" from the second token to get the RELEASE data item.
            if (isset($id_tokens[1])) $release = substr(trim($id_tokens[1]), 1);
            // we basically ignore the fourth token, which we assume is always "seq.".
            if (isset($id_tokens[2])) $domain_count = (int) (trim($id_tokens[2]));
        }

        /* AC - ACCESSION data field - exactly one entry (word) in one line
           Example: AC PD266930 */
        if ($linelabel == "AC") $accession = $linedata;

        /* KW - KEYWORD data field
           Syntax: KW [FREQUENT_NAME(OCCURRENCE)...] // KEYWORD [KEYWORD ...]
           Example: KW FADR(2) Y586(1) // COMPLETE PROTEOME DNA-BINDING FATTY TRANSCRIPTION
           REGULATION METABOLISM REGULATOR ACID ACTIVATOR */
        if ($linelabel == "KW") {
            // Split on the literal "//" and keep empty parts, so a line with no
            // frequent names does not have its keywords mistaken for them.
            $kw_parts = explode("//", $linedata);
            // E.g. "FADR(2) Y586(1) ", " COMPLETE PROTEOME DNA-BINDING..."

            $aFreqNames = array();
            $freqname_tokens = preg_split("/\s+/", trim($kw_parts[0]), -1, PREG_SPLIT_NO_EMPTY);
            // E.g. array( "FADR(2)", "Y586(1)" ); \s+ as separator leaves no
            // leading/trailing whitespace on the elements.
            foreach ($freqname_tokens as $freqname) {
                $name_tokens = preg_split("/\(/", $freqname, -1, PREG_SPLIT_NO_EMPTY);
                // e.g. "FADR", "2)"
                if (!isset($name_tokens[1])) continue;     // no "(count)" part
                // Drop the closing ")" and store name => occurrence count.
                $aFreqNames[$name_tokens[0]] = (int) substr($name_tokens[1], 0, strlen($name_tokens[1]) - 1);
            }

            $aKeywords = isset($kw_parts[1])
                ? preg_split("/\s+/", trim($kw_parts[1]), -1, PREG_SPLIT_NO_EMPTY)
                : array();
        }

        if ($linelabel == "//") break;
    }

    $oProtFam = new ProtFam_Prodom();
    $oProtFam->entry_no = $entry_no;
    $oProtFam->accession = $accession;
    $oProtFam->release = $release;
    $oProtFam->domain_count = $domain_count;
    $oProtFam->freq_names = $aFreqNames;
    $oProtFam->keywords = $aKeywords;
    return $oProtFam;
}

/*
ID 20167 p2002.1 10 seq.
AC PD266930 KW FADR(2) Y586(1) // COMPLETE PROTEOME DNA-BINDING FATTY TRANSCRIPTION REGULATION METABOLISM REGULATOR ACID ACTIVATOR LA 74 ND 10 CC -!- DIAMETER: 119 PAM CC -!- RADIUS OF GYRATION: 53 PAM CC -!- SEQUENCE CLOSEST TO CONSENSUS: Q8ZEL9_YERPE 5-78 (distance:15 PAM) DC This family was generated by psi-blast, with a profile built from the seed aligment of the following SCOP FAMILY DC a.4.5.6 AL P09371|FADR_ECOLI 4 77 0.22 AQSPAGFAEEYIIESIWNNRFPPGTILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS AL Q8ZP15|Q8ZP15_SALTY 5 78 0.22 AQSPAGFAEEYIIESIWNNRFPPGTILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS AL Q8ZEL9|Q8ZEL9_YERPE 5 78 0.22 AQSPAGFAEEYIIESIWNNRFPPGSILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS AL Q8Z685|Q8Z685_SALTI 5 78 0.35 AQSPAGFAEEYIIESIWNNCFPPGTILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS AL Q9KQU8|Q9KQU8_VIBCH 5 78 0.62 AKSPAGFAEKYIIESIWNGRFPPGSILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNQFMETS AL Q9CPJ0|Q9CPJ0_PASMU 10 83 0.77 AQSPAGLAEEYIVRSIWNNHFPPGSDLPAERELAEKIGVTRTTLREVLQRLARDGWLNIQHGKPTKVNNIWETS AL P44705|FADR_HAEIN 10 81 1.08 AQSPAALAEEYIVKSIWQDVFPAGSNLPSERDLADKIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNIWD.. AL O07792|Y586_MYCTU 17 77 2.08 .........EQIATDVLTGEMPPGEALPSERRLAELLGVSRPAVREALKRLSAAGLVEVRQGDVTTVRDF.... AL Q11159|Y494_MYCTU 27 77 2.21 ...........IADAILDGVFPPGSTLPPERDLAERLGVNRTSLRQGLARLQQMGLIEVRHG............ AL Q8XFI2|Q8XFI2_SALTY 59 109 2.23 ...........IIKLINDNIFPPGTFLPPERELAKQLGVSRASLREALIVLEISGWIVIQSG............ 
CO AQSPAGFAEEYIVKSIWDGVFPPGSTLPPERELAERLGVSRTSLREALQRLERDGWIEIQHGKPTKVNNFWETS
DR INTERPRO; IPR000524 "Bacterial regulatory proteins, GntR"
DR PfamA; PF00392 gntR
DR PROSITE; PS00043 PDOC00042 HTH_GNTR_FAMILY (27-51)
DR PDB; 1H9T chain B (5-78) Q8ZP15_SALTY (5-78),1HW1 chain A (5-78),1HW1 chain B (5-78)
DR PDB; 1H9T chain A (5-78) Q8ZP15_SALTY (5-78)
//
*/
?>
===================================================== REFSEQ.INC.PHP - SOURCE CODE
<?php
// refseq.inc.php

/**
 * SeqAlign() is the constructor method for the SeqAlign class. It initializes class properties.
 *
 * NOTE(review): this listing shows the method without its enclosing class
 * declaration. It uses $this and the seq class (with its symfreq() method),
 * neither of which is defined here, so it has to live inside class SeqAlign.
 *
 * With an empty $filename an empty alignment is set up. Otherwise the file is
 * read and the properties seqset, seq_count, length, seqptr, gap_count and
 * is_flush are filled from it. A $format other than "FASTA" or "CLUSTAL"
 * leaves the properties untouched.
 *
 * @param string $filename Path of the alignment file; "" for an empty alignment.
 * @param string $format   "FASTA" or "CLUSTAL".
 */
function SeqAlign($filename = "", $format = "FASTA")
{
    if (strlen($filename) == 0) {
        $this->seq_count = 0;
        $this->length = 0;
        $this->seqptr = 0;
        $this->gap_count = 0;
        $this->is_flush = TRUE;
        $this->seqset = array();
        return;
    }

    if ($format == "FASTA") {
        $flines = file($filename);
        if ($flines === FALSE) $flines = array();   // unreadable file: empty alignment

        // Pass 1: collect one record per ">" header. Header syntax handled
        // here is ">id/start-end"; the range part is optional.
        $records = array();
        $last = -1;
        foreach ($flines as $linestr) {
            if (substr($linestr, 0, 1) == ">") {
                // trim() keeps the line's trailing newline out of id / end.
                $words = preg_split("/[\>\/]/", trim(substr($linestr, 1)));
                $indexes = isset($words[1]) ? preg_split("/-/", $words[1]) : array();
                $last++;
                $records[$last] = array(
                    "id" => $words[0],
                    "start" => isset($indexes[0]) ? $indexes[0] : NULL,
                    "end" => isset($indexes[1]) ? $indexes[1] : NULL,
                    "sequence" => ""
                );
            } elseif ($last >= 0) {
                // Residue line of the current record; text before the first
                // header is ignored.
                $records[$last]["sequence"] .= trim($linestr);
            }
        }

        // Pass 2: build the seq objects and the alignment statistics. Every
        // record is stored, including the only one of a single-sequence file
        // and the last one, whose length also takes part in the flush check.
        $this->seqset = array();
        $maxlen = 0;
        $gapctr = 0;
        $samelength = TRUE;
        $prev_len = NULL;
        foreach ($records as $record) {
            $seqlen = strlen($record["sequence"]);

            $seq_obj = new seq();
            $seq_obj->id = $record["id"];
            $seq_obj->length = $seqlen;
            $seq_obj->sequence = $record["sequence"];
            $seq_obj->start = $record["start"];
            $seq_obj->end = $record["end"];
            $gapctr += $seq_obj->symfreq("-");

            if ($seqlen > $maxlen) $maxlen = $seqlen;
            if ($prev_len !== NULL && $seqlen != $prev_len) $samelength = FALSE;
            $prev_len = $seqlen;

            array_push($this->seqset, $seq_obj);
        }

        $this->seq_count = count($records);
        $this->length = $maxlen;
        $this->seqptr = 0;
        $this->gap_count = $gapctr;
        $this->is_flush = $samelength;
    } elseif ($format == "CLUSTAL") {
        $flines = file($filename);
        if ($flines === FALSE) $flines = array();   // unreadable file: empty alignment

        $seqmap = array();      // sequence name => concatenated residues
        $conserve_line = "";
        $lastlen = 0;           // length of the most recent sequence chunk
        $linectr = 0;
        foreach ($flines as $linestr) {
            $linectr++;
            if ($linectr == 1) continue;                    // skip the first line.
            if (strlen(trim($linestr)) == 0) continue;      // ignore blank lines.

            // Columns 0-15 hold the name, the next 60 columns the residues.
            $seqname = trim(substr($linestr, 0, 16));
            $seqline = substr($linestr, 16, 60);

            if (strlen($seqname) == 0) {
                // Conservation line: keep as many columns as the last chunk had.
                $conserve_line .= substr($seqline, 0, $lastlen);
                continue;
            }

            // Trim every chunk, including the first one of a sequence, so a
            // short line does not leave its newline inside the sequence.
            $chunk = trim($seqline);
            if (isset($seqmap[$seqname])) {
                $seqmap[$seqname] .= $chunk;
            } else {
                $seqmap[$seqname] = $chunk;
            }
            $lastlen = strlen($chunk);
        }

        $this->seqset = array();
        $gapctr = 0;
        $maxlen = 0;
        foreach ($seqmap as $key => $value) {
            $seq_obj = new seq();
            $seq_obj->id = $key;
            $seq_obj->length = strlen($value);
            $seq_obj->sequence = $value;
            $seq_obj->start = 0;
            $seq_obj->end = $seq_obj->length - 1;
            $gapctr += $seq_obj->symfreq("-");
            if ($seq_obj->length > $maxlen) $maxlen = $seq_obj->length;
            array_push($this->seqset, $seq_obj);
        }

        $this->seq_count = count($seqmap);
        // A conservation line with no symbols is all blanks and is skipped as a
        // blank line above, so the collected conservation string can be shorter
        // than the alignment; never report less than the longest sequence.
        $this->length = max(strlen($conserve_line), $maxlen);
        $this->seqptr = 0;
        $this->gap_count = $gapctr;
        $this->is_flush = TRUE;
    }
}
?>
===================================================== TRANSFAC.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");

/**
 * Data holder for the fields of one Transfac MATRIX.DAT entry.
 * parse_tfmatrix_transfac() currently fills accession, id, date_created,
 * date_updated, desc and comments.
 */
class TFMatrix
{
    var $accession;
    var $id;
    var $date_created;
    var $date_updated;
    var $bnd_factor;
    var $desc;
    var $linked_factors;
    var $matrix;
    var $stat_basis;
    var $comments;
    var $ref_no;
    var $ref_author;
    var $ref_title;
    var $ref_data;
}

/**
 * parse_tfmatrix_transfac() parses MATRIX.DAT (Transfac) and returns a TFMATRIX
 * object containing parsed data.
 *
 * The first two characters of a line are its label, the data part starts at
 * column 4, and parsing stops at the "//" line. left() is not defined in this
 * file; it is presumably provided by etc.inc.php - TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return TFMatrix Fields missing from the entry are NULL.
 */
function parse_tfmatrix_transfac($flines)
{
    // Defaults for fields that may be missing from the entry.
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $desc = NULL;
    $comments = NULL;

    $cc_flag = FALSE;
    $cc_string = "";

    // Sentinel terminator: flushes pending CC lines when the "//" line is missing.
    $flines[] = "//";

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 4));

        // ID - IDENTIFICATION data field - one string in one line.
        if ($linelabel == "ID") $id = $linedata;

        // AC - ACCESSION NO data field - one string in one line.
        if ($linelabel == "AC") $accession = $linedata;

        /* DT - DATE data field - usually comes in two lines, the first is the date
           created, and the second, the date updated.
           Example:
           DT 20.06.90 (created); ewi.
           DT 24.08.95 (updated); hiwi. */
        if ($linelabel == "DT") {
            // assume "created", "updated" appear in lowercase at fixed position in DT line.
            $type = substr($linedata, 10, 7);
            if ($type == "created") $date_created = substr($linedata, 0, 8);
            if ($type == "updated") $date_updated = substr($linedata, 0, 8);
        }

        // DE - DESCRIPTION data field. From sample data, it appears to be one line only.
        if ($linelabel == "DE") $desc = $linedata;

        /* CC - COMMENTS data field - assume to be one or more lines to be concatenated
           by a whitespace character.
           Example:
           CC Group I in [903]; 5 sites selected in vitro for binding to E12N
           CC (=N-terminally truncated E12); matrix corrected according to
           CC the published sequences */
        if ($linelabel == "CC") {
            $cc_string .= $linedata . " ";
            $cc_flag = TRUE;
        } elseif ($cc_flag) {
            $comments = trim($cc_string);
            $cc_flag = FALSE;
        }

        if ($linelabel == "//") break;
    }

    $oTFMatrix = new TFMatrix();
    $oTFMatrix->accession = $accession;
    $oTFMatrix->id = $id;
    $oTFMatrix->date_created = $date_created;
    $oTFMatrix->date_updated = $date_updated;
    $oTFMatrix->desc = $desc;
    $oTFMatrix->comments = $comments;
    return $oTFMatrix;
}

/**
 * Data holder for the fields of one Transfac GENE.DAT entry.
 */
class TFGene
{
    var $accession;
    var $id;
    var $date_created;
    var $date_updated;
    var $desc_short;
    var $desc_long;
    var $organism;
    var $species;
    var $tax_class;
    var $bucher_class;
    var $tfsite_pos;
    var $tfsite_accno;
    var $compel_accno;
    var $trrd_accno;
}

// parse_tfgene_transfac() parses GENE.DAT (Transfac) and returns a TFGENE object containing parsed data.
/**
 * Parses the lines of one Transfac GENE.DAT entry into a TFGene object.
 *
 * The first two characters of a line are its label, the data part starts at
 * column 4, and parsing stops at the "//" line. left() and trim_element() are
 * not defined in this file; they are presumably provided by etc.inc.php -
 * TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return TFGene Scalar fields missing from the entry are NULL; tax_class and
 *                compel_accno default to empty arrays.
 */
function parse_tfgene_transfac($flines)
{
    // Defaults for fields that may be missing from the entry.
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $desc_short = NULL;
    $desc_long = NULL;
    $organism = NULL;
    $species = NULL;
    $tax_tokens = array();
    $aCompel = array();

    $tax_flag = FALSE;
    $tax_string = "";

    // Sentinel terminator: flushes pending OC lines when the "//" line is missing.
    $flines[] = "//";

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 4));

        // ID - IDENTIFICATION data field - one string in one line.
        if ($linelabel == "ID") $id = $linedata;

        // AC - ACCESSION NO data field - one string in one line.
        if ($linelabel == "AC") $accession = $linedata;

        /* DT - DATE data field - usually comes in two lines, the first is the date
           created, and the second, the date updated.
           Example:
           DT 20.06.90 (created); ewi.
           DT 24.08.95 (updated); hiwi. */
        if ($linelabel == "DT") {
            // assume "created", "updated" appear in lowercase at fixed position in DT line.
            $type = substr($linedata, 10, 7);
            if ($type == "created") $date_created = substr($linedata, 0, 8);
            if ($type == "updated") $date_updated = substr($linedata, 0, 8);
        }

        // SD - SHORT DESCRIPTION data field. From sample data, it appears to be one line only.
        if ($linelabel == "SD") $desc_short = $linedata;

        // DE - LONG DESCRIPTION/GENE NAME data field. From sample data, it appears to be one line only.
        if ($linelabel == "DE") $desc_long = $linedata;

        // OS - ORGANISM SPECIES data field - assume to be always one line of this form
        // (same as in class Factor):
        // Syntax: OS common_name, scientific_name.
        // Example: OS human, homo sapiens
        // Output: $organism = "human"
        //         $species = "homo sapiens"
        if ($linelabel == "OS") {
            $org_tokens = preg_split("/,/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            array_walk($org_tokens, "trim_element");
            // Either part may be absent; a missing part is NULL.
            $organism = isset($org_tokens[0]) ? $org_tokens[0] : NULL;
            $species = isset($org_tokens[1]) ? $org_tokens[1] : NULL;
        }

        // OC - ORGANISM CLASSIFICATION data field - one or more lines of this form
        // (same as class Factor):
        // Syntax: OC kingdom; phylum; class; ...;
        // Example:
        // OC eukaryota; animalia; metazoa; chordata; vertebrata;
        // OC tetrapoda; mammalia; eutheria; primates
        // Output: $tax_class = array("eukaryota", "mammalia", ...)
        // Later, convert this into an associative array. Same goes for GenBank, etc. - Serge
        if ($linelabel == "OC") {
            $tax_string .= $linedata . " ";
            $tax_flag = TRUE;
        } elseif ($tax_flag) {
            $tax_tokens = preg_split("/;/", trim($tax_string), -1, PREG_SPLIT_NO_EMPTY);
            array_walk($tax_tokens, "trim_element");
            $tax_flag = FALSE;
        }

        /* CO - COMPEL ACCESSION NO data field. From data, one entry (word) in one line,
           multiple lines.
           Example:
           CO C00001
           CO C00005
           CO C00006 */
        if ($linelabel == "CO") $aCompel[] = $linedata;

        if ($linelabel == "//") break;
    }

    $oTFGene = new TFGene();
    $oTFGene->accession = $accession;
    $oTFGene->id = $id;
    $oTFGene->date_created = $date_created;
    $oTFGene->date_updated = $date_updated;
    $oTFGene->desc_short = $desc_short;
    $oTFGene->desc_long = $desc_long;
    $oTFGene->organism = $organism;
    $oTFGene->species = $species;
    $oTFGene->tax_class = $tax_tokens;
    $oTFGene->compel_accno = $aCompel;
    return $oTFGene;
}

/**
 * Data holder for the fields of one Transfac CLASS.DAT entry.
 * parse_tfclass_transfac() currently fills accession, id, date_created,
 * date_updated, class and comments.
 */
class TFClass
{
    var $accession;
    var $id;
    var $date_created;
    var $date_updated;
    var $class;
    var $struct_desc;
    var $comments;
    var $member_factors;
    var $ref_no;
    var $ref_author;
    var $ref_title;
    var $ref_data;
    var $dbref;
}

/**
 * parse_tfclass_transfac() parses CLASS.DAT (Transfac) and returns a TFCLASS
 * object containing parsed data.
 *
 * The first two characters of a line are its label, the data part starts at
 * column 4, and parsing stops at the "//" line. left() and trim_element() are
 * not defined in this file; they are presumably provided by etc.inc.php -
 * TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return TFClass Scalar fields missing from the entry are NULL; class
 *                 defaults to an empty array.
 */
function parse_tfclass_transfac($flines)
{
    // Defaults for fields that may be missing from the entry.
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $class_tokens = array();
    $comments = NULL;

    $class_flag = FALSE;
    $class_string = "";
    // The comment accumulator was used without ever being initialised.
    $cc_flag = FALSE;
    $cc_string = "";

    // Sentinel terminator: flushes pending CL / CC lines when the "//" line is missing.
    $flines[] = "//";

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 4));

        // ID - IDENTIFICATION data field - one string in one line.
        if ($linelabel == "ID") $id = $linedata;

        // AC - ACCESSION NO data field - one string in one line.
        if ($linelabel == "AC") $accession = $linedata;

        /* DT - DATE data field - usually comes in two lines, the first is the date
           created, and the second, the date updated.
           Example:
           DT 20.06.90 (created); ewi.
           DT 24.08.95 (updated); hiwi. */
        if ($linelabel == "DT") {
            // assume "created", "updated" appear in lowercase at fixed position in DT line.
            $type = substr($linedata, 10, 7);
            if ($type == "created") $date_created = substr($linedata, 0, 8);
            if ($type == "updated") $date_updated = substr($linedata, 0, 8);
        }

        // CL - CLASS data field - assume to be one or more lines, each entry separated by ;
        // Example: CL zinc cluster; zinc-cysteine cluster; C6 zinc finger
        // Output: ( "zinc cluster", "zinc-cysteine cluster", ... )
        if ($linelabel == "CL") {
            $class_string .= $linedata . " ";
            $class_flag = TRUE;
        } elseif ($class_flag) {
            // A value without ";" yields a single-element array; empty pieces
            // (e.g. after a trailing ";") are dropped, as for the other lists.
            $class_tokens = preg_split("/;/", trim($class_string), -1, PREG_SPLIT_NO_EMPTY);
            array_walk($class_tokens, "trim_element");
            // Later, look into possibility that some elements of $class_tokens array might
            // contain special characters like ', \, /, etc. - Serge
            $class_flag = FALSE;
        }

        /* CC - COMMENTS data field - assume to be one or more lines to be concatenated
           by a whitespace character.
           Example:
           CC Zinc finger motif of GATA-type. Two such motifs are present
           CC in each molecule. Each finger comprises 4 cysteine residues
           CC presumably coordinating one zinc ion. However, metal chelators
           CC do not suppress DNA-binding */
        if ($linelabel == "CC") {
            $cc_string .= $linedata . " ";
            $cc_flag = TRUE;
        } elseif ($cc_flag) {
            $comments = trim($cc_string);
            $cc_flag = FALSE;
        }

        if ($linelabel == "//") break;
    }

    $oTFClass = new TFClass();
    $oTFClass->accession = $accession;
    $oTFClass->id = $id;
    $oTFClass->date_created = $date_created;
    $oTFClass->date_updated = $date_updated;
    $oTFClass->class = $class_tokens;
    $oTFClass->comments = $comments;
    return $oTFClass;
}

/**
 * Data holder for the fields of one Transfac CELL.DAT entry.
 */
class Cell
{
    var $accession;
    var $id;
    var $date_created;
    var $date_updated;
    var $author;
    var $organism;
    var $factor_src;
    var $desc;
}

// parse_cell_transfac() parses CELL.DAT (Transfac) and returns a CELL object containing parsed data.
/**
 * Parses the lines of one Transfac CELL.DAT entry into a Cell object.
 *
 * The first two characters of a line are its label, the data part starts at
 * column 4, and parsing stops at the "//" line. left() is not defined in this
 * file; it is presumably provided by etc.inc.php - TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return Cell Fields missing from the entry are NULL; desc is "" when the
 *              entry has no CD lines.
 */
function parse_cell_transfac($flines)
{
    // Defaults for fields that may be missing from the entry.
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $organism = NULL;
    $factor_src = NULL;

    $cd_string = "";

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 4));

        // ID - IDENTIFICATION data field - one string in one line.
        if ($linelabel == "ID") $id = $linedata;

        // AC - ACCESSION NO data field - one string in one line.
        if ($linelabel == "AC") $accession = $linedata;

        /* DT - DATE data field - usually comes in two lines, the first is the date
           created, and the second, the date updated.
           Example:
           DT 20.06.90 (created); ewi.
           DT 24.08.95 (updated); hiwi. */
        if ($linelabel == "DT") {
            // assume "created", "updated" appear in lowercase at fixed position in DT line.
            $type = substr($linedata, 10, 7);
            if ($type == "created") $date_created = substr($linedata, 0, 8);
            if ($type == "updated") $date_updated = substr($linedata, 0, 8);
        }

        // OS - ORGANISM SPECIES data field - assume to be always one line of this form:
        // Syntax: OS common_name
        // Example: OS human
        // Output: $organism = "human"
        // Note: This is like the OS field in the FACTOR class minus the SPECIES (sci name).
        if ($linelabel == "OS") $organism = $linedata;

        // SO - FACTOR SOURCE data field. Assume to be one line.
        if ($linelabel == "SO") $factor_src = $linedata;

        // CD - CELL DESCRIPTION data field - may be one or more lines, to be concatenated
        // with a whitespace between lines. The text is trimmed once, after the loop, so
        // the separating blank between lines is never lost and a description that ends
        // the input is still trimmed.
        if ($linelabel == "CD") $cd_string .= $linedata . " ";

        if ($linelabel == "//") break;
    }

    $oCell = new Cell();
    $oCell->accession = $accession;
    $oCell->id = $id;
    $oCell->date_created = $date_created;
    $oCell->date_updated = $date_updated;
    $oCell->organism = $organism;
    $oCell->factor_src = $factor_src;
    $oCell->desc = trim($cd_string);
    return $oCell;
}

/**
 * Data holder for the fields of one Transfac FACTOR.DAT entry.
 */
class Factor
{
    var $accession;
    var $id;
    var $date_created;
    var $date_updated;
    var $author;
    var $factor_name;
    var $synonyms;
    var $organism;      // "organism" here refers to the common name.
    var $species;       // "species" is the scientific name of "organism".
    var $tax_class;
    var $homologs;
    var $homolog;       // deprecated alias of $homologs: the parser used to write the
                        // list to this undeclared name; kept for existing callers.
    var $class_accno;
    var $class_id;
    var $class_decno;
    var $length;
    var $molwt;
    var $sequence;
    var $seq_comment;
    var $features;
    var $feat_struct;
    var $cell_spec_pos;
    var $cell_spec_neg;
    var $feat_func;
    var $inter_fact;
    var $matrix;
    var $bndsite_accno;
    var $bndsite_id;
    var $bndsite_quality;
    var $bndsite_species;
    var $ref_no;
    var $ref_author;
    var $ref_title;
    var $ref_data;
    var $dbref;
}

/**
 * parse_factor_transfac() parses FACTOR.DAT (Transfac) and returns a Factor
 * object containing parsed data.
 *
 * The first two characters of a line are its label, the data part starts at
 * column 4, and parsing stops at the "//" line. left() and trim_element() are
 * not defined in this file; they are presumably provided by etc.inc.php -
 * TODO confirm.
 *
 * @param array $flines Lines of the entry, one string per line.
 * @return Factor Scalar fields missing from the entry are NULL; synonyms,
 *                homologs and tax_class default to empty arrays.
 */
function parse_factor_transfac($flines)
{
    // Defaults for fields that may be missing from the entry.
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $factor_name = NULL;
    $organism = NULL;
    $species = NULL;
    $syn_tokens = array();
    $homo_tokens = array();
    $tax_tokens = array();
    $class_tokens = array();

    // State of the multi-line fields.
    $syn_flag = FALSE;
    $syn_string = "";
    $homo_flag = FALSE;
    $homo_string = "";
    $tax_flag = FALSE;
    $tax_string = "";

    // Sentinel terminator: flushes pending SY / OC / HO lines when the "//" line is missing.
    $flines[] = "//";

    foreach ($flines as $linestr) {
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 4));

        // ID - IDENTIFICATION data field - one string in one line. (Same as SITE)
        if ($linelabel == "ID") $id = $linedata;

        // AC - ACCESSION NO data field - one string in one line. (Same as SITE)
        if ($linelabel == "AC") $accession = $linedata;

        /* DT - DATE data field - usually comes in two lines, the first is the date/time
           created, and the second, the date updated.
           Example:
           DT 20.06.90 11:00:03 (created); ewi.
           DT 24.08.95 (updated); hiwi.
           A TIME entry is allowed after the DATE. Later, update the code for the DT
           field of class SITE. - Serge */
        if ($linelabel == "DT") {
            // "(created);" / "(updated);" follow the date (and optional time), not at a
            // fixed position. Locate the marker token and take everything before it as
            // the timestamp, so the number of words after the marker does not matter.
            $date_tokens = preg_split("/\s+/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            foreach ($date_tokens as $pos => $token) {
                if ($pos > 0 && ($token == "(created);" || $token == "(updated);")) {
                    $stamp = implode(" ", array_slice($date_tokens, 0, $pos));
                    if ($token == "(created);") {
                        $date_created = $stamp;
                    } else {
                        $date_updated = $stamp;
                    }
                    break;
                }
            }
        }

        // FA - FACTOR NAME data field - assume one string in one line.
        if ($linelabel == "FA") $factor_name = $linedata;

        // SY - SYNONYMS data field - assume to be one or more lines, each entry separated by ;
        // Example: SY AGP/EBP; ANF-2; CRP2; H-APF-2; IL-6DBP; LAP; LAP1; NF-IL6; NF-M;
        // Output: ( "AGP/EBP", "ANF-2", ... )
        if ($linelabel == "SY") {
            $syn_string .= $linedata . " ";
            $syn_flag = TRUE;
        } elseif ($syn_flag) {
            $syn_tokens = preg_split("/;/", trim($syn_string), -1, PREG_SPLIT_NO_EMPTY);
            array_walk($syn_tokens, "trim_element");
            // Later, look into possibility that some elements of $syn_tokens array might
            // contain special characters like ', \, /, etc. - Serge
            $syn_flag = FALSE;
        }

        // OS - ORGANISM SPECIES data field - assume to be always one line of this form:
        // Syntax: OS common_name, scientific_name.
        // Example: OS human, homo sapiens
        // Output: $organism = "human"
        //         $species = "homo sapiens"
        if ($linelabel == "OS") {
            $org_tokens = preg_split("/,/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            array_walk($org_tokens, "trim_element");
            // Either part may be absent; a missing part is NULL.
            $organism = isset($org_tokens[0]) ? $org_tokens[0] : NULL;
            $species = isset($org_tokens[1]) ? $org_tokens[1] : NULL;
        }

        // OC - ORGANISM CLASSIFICATION data field - one or more lines of this form:
        // Syntax: OC kingdom; phylum; class; ...;
        // Example:
        // OC eukaryota; animalia; metazoa; chordata; vertebrata;
        // OC tetrapoda; mammalia; eutheria; primates
        // Output: $tax_class = array("eukaryota", "mammalia", ...)
        // Later, convert this into an associative array. Same goes for GenBank, etc. - Serge
        if ($linelabel == "OC") {
            $tax_string .= $linedata . " ";
            $tax_flag = TRUE;
        } elseif ($tax_flag) {
            $tax_tokens = preg_split("/;/", trim($tax_string), -1, PREG_SPLIT_NO_EMPTY);
            array_walk($tax_tokens, "trim_element");
            $tax_flag = FALSE;
        }

        // HO - HOMOLOGS data field - assume to be multiple entries separated by comma,
        // may span one or more lines to be concatenated by a whitespace.
        if ($linelabel == "HO") {
            $homo_string .= $linedata . " ";
            $homo_flag = TRUE;
        } elseif ($homo_flag) {
            $homo_tokens = preg_split("/,/", trim($homo_string), -1, PREG_SPLIT_NO_EMPTY);
            array_walk($homo_tokens, "trim_element");
            $homo_flag = FALSE;
        }

        // CL - CLASS data field. Always one line with 3 entries sep by a ;
        // Example: CL C0001; CH; 2.3.3.0.1.
        // Output: $class_accno = "C0001", $class_id = "CH", $class_decno = "2.3.3.0.1."
        if ($linelabel == "CL") {
            $class_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            array_walk($class_tokens, "trim_element");
        }

        if ($linelabel == "//") break;
    }

    $oFactor = new Factor();
    $oFactor->accession = $accession;
    $oFactor->id = $id;
    $oFactor->date_created = $date_created;
    $oFactor->date_updated = $date_updated;
    $oFactor->factor_name = $factor_name;
    $oFactor->synonyms = $syn_tokens;
    $oFactor->organism = $organism;
    $oFactor->species = $species;
    // Fill the declared property; the alias keeps callers of the old name working.
    $oFactor->homologs = $homo_tokens;
    $oFactor->homolog = $homo_tokens;
    $oFactor->tax_class = $tax_tokens;
    $oFactor->class_accno = isset($class_tokens[0]) ? $class_tokens[0] : NULL;
    $oFactor->class_id = isset($class_tokens[1]) ? $class_tokens[1] : NULL;
    $oFactor->class_decno = isset($class_tokens[2]) ? $class_tokens[2] : NULL;
    return $oFactor;
}

class Site
{
    var $accession;
    var $id;
    var $date_created;
    var $date_updated;
    var $author;
    var $seqtype;
    var $desc;
    var $gene_region;
    var $regel_seq;
    var $denom;
    var $firstpos;
    var $lastpos;
    var $firstpos_def;
    var $bind_factor;
    var $organism;
    var $tax_class;
    var $factor_src;
    var $method;
    var $comments;
    var $dbref;
    var $refno;
    var $ref_author;
    var $ref_title;
    var $ref_data;
} // closes CLASS SITE

// parse_site_transfac() parses SITE.DAT (Transfac) and returns a Site object containing parsed data.
// NOTE(review): only the opening of this function lies inside the reviewed excerpt; its
// statements are left exactly as they were, with line breaks restored.
function parse_site_transfac($flines)
{
    $desc_flag = FALSE;
    $desc_string = "";
    $region_flag = FALSE;
    $region_string = "";

    while ( list($no, $linestr) = each($flines) ) {
        // OPENS 1st (outermost) while ( list($no, $linestr) = each($flines) )
        $linelabel = left($linestr, 2);
        $linedata = trim(substr($linestr, 4));
        $lineend = right($linedata, 1);

        // ID - IDENTIFICATION data field - one string in one line.
        if ($linelabel == "ID") $id = $linedata;

        // AC - ACCESSION NO data field - one string in one line.
        if ($linelabel == "AC") $accession = $linedata;

        /* DT - DATE data field - usually comes in two lines, the first is the date created,
           and the second, the date updated.
           Example:
           DT 20.06.90 (created); ewi.
DT 24.08.95 (updated); hiwi. */ if ($linelabel == "DT") { // assume "created", "updated" appear in lowercase at fixed position in DT line. $type = substr($linedata,10,7); if ($type == "created") $date_created = substr($linedata,0,8); if ($type == "updated") $date_updated = substr($linedata,0,8); } // TY - SEQUENCE TYPE data field - one string (one letter?) in one line. // Example: TY D if ($linelabel == "TY") $seqtype = $linedata; // DE - DESCRIPTION data field - from sample data, it seems always one line. // Assume may be one or more lines concatenated with a whitespace char. if ($linelabel == "DE") { $desc_string .= $linedata . " "; $desc_flag = TRUE; } elseif ($desc_flag) { $desc_string = trim($desc_string); $desc_flag = FALSE; } // RE - GENE REGION data field - from sample data, it seems always one line. // Assume may be one or more lines concatenated with a whitespace char. // Example: RE intron promoter if ($linelabel == "RE") { $region_string .= $linedata . " "; $region_flag = TRUE; } elseif ($region_flag) { $region_string = trim($region_string); $region_flag = FALSE; } // "//" - END OF RECORD MARKER if ($linelabel == "//") break; } // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) ) $oSite = new Site(); $oSite->accession = $accession; $oSite->id = $id; $oSite->date_created = $date_created; $oSite->date_updated = $date_updated; $oSite->seqtype = $seqtype; $oSite->desc = $desc_string; $oSite->gene_region = $region_string; return $oSite; } // CLOSES parse_site_transfac() function ?> ===================================================== UNIGENE.INC.PHP - SOURCE CODE <?php //pdbstr.inc.php require_once("etc.inc.php"); require_once("seqdb.inc.php"); require_once("seq.inc.php"); class Gene_Unigene { // MEMBER section var $entry_id; var $title; var $seq_count; } function parse_gene_unigene($flines) { // initialize variables here $title_flag = FALSE; $title_string = ""; while ( list($no, $linestr) = each($flines) ) { $linelabel = 
trim(substr($linestr,0,12)); $linedata = trim(substr($linestr,12)); // ID data field - from observation, one entry (word) in one line. // Example: ID Sbi.1 if ($linelabel == "ID") $entry_id = $linedata; /* TITLE data field - assume to be multiline. Example: TITLE ESTs, Moderately similar to putative pyrophosphate-fructose-6-phosphate 1-phosphotransferase [Arabidopsis thaliana] [A.thaliana] */ if ($linelabel == "TITLE") { $title_string .= $linedata . " "; $title_flag = TRUE; } elseif ($title_flag) { $title = trim($title_string); $title_string = ""; $title_flag = FALSE; } /* EXPRESS data field Example: EXPRESS Embryos germinated for 24 hr ; 10- to 14-day-old light-grown (greenhouse) seedlings ; Mix of ovaries of varying immature stages from 8-week-old plants ; Developing preanthesis pannicles ; Leaves */ if ($linelabel == "EXPRESS") {} /* PROTSIM data field Example: PROTSIM ORG=Arabidopsis thaliana; PROTGI=15221156; PROTID=ref:NP_172664.1; PCT=79.41; ALN=68 */ if ($linelabel == "PROTSIM") {} /* SCOUNT - SEQUENCE COUNT data field Example: SCOUNT 12 */ if ($linelabel == "SCOUNT") $seq_count = (int) $linedata; if ($linelabel == "//") break; } $oGene = new Gene_Unigene(); $oGene->entry_id = $entry_id; $oGene->title = $title; $oGene->seq_count = $seq_count; return $oGene; } ?>

[ Home Page ] [ I/O Scripts Page ]

 


Copyright © 2003 by Sergio Gregorio, Jr.
All rights reserved.