Google

Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

gnFASSource.cpp

Go to the documentation of this file.
00001 
00002 // File:            gnFASSource.h
00003 // Purpose:         Implements gnBaseSource for .FAS files
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.1.0 
00007 // Author:          Aaron Darling 
00008 // Last Edited:     April 15, 2001, 11:13:00pm 
00009 // Modified by:     
00010 // Copyright:       (c) Aaron Darling 
00011 // Licenses:        Proprietary 
00013 #include "gn/gnFASSource.h"
00014 #include "gn/gnBaseSpec.h"
00015 #include "gn/gnFilter.h"
00016 #include "gn/gnStringTools.h"
00017 #include "gn/gnSourceSpec.h"
00018 #include "gn/gnSourceHeader.h"
00019 #include "gn/gnDebug.h"
00020 
00021 gnFASSource::gnFASSource()
00022 {
00023         m_openString = "";
00024         m_pFilter = gnFilter::fullDNASeqFilter();
00025         if(m_pFilter == NULL){
00026                 DebugMsg("Error using static sequence filters.");
00027         }
00028 }
00029 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s)
00030 {
00031         vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00032         for( ; iter != s.m_contigList.end(); ++iter )
00033         {
00034                 m_contigList.push_back( (*iter)->Clone() );
00035         }
00036 }
00037 gnFASSource::~gnFASSource()
00038 {
00039         m_ifstream.close();
00040         vector< gnFileContig* >::iterator iter = m_contigList.begin();
00041         for( ; iter != m_contigList.end(); ++iter )
00042         {
00043                 gnFileContig* fg = *iter;
00044                 *iter = 0;
00045                 delete fg;
00046         }
00047 }
00048 boolean gnFASSource::HasContig( const string& nameStr ) const
00049 {
00050         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00051         {
00052                 if( nameStr == m_contigList[i]->GetName() )
00053                         return true;
00054         }
00055         return false;
00056 }
00057 uint32 gnFASSource::GetContigID( const string& name ) const
00058 {
00059         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00060         {
00061                 if( name == m_contigList[i]->GetName() )
00062                         return i;
00063         }
00064         return ALL_CONTIGS;
00065 }
00066 string gnFASSource::GetContigName( const uint32 i ) const
00067 {
00068         if( i < m_contigList.size() ){
00069                 return m_contigList[i]->GetName();
00070         }
00071         return "";
00072 }
00073 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const
00074 {
00075         if( i < m_contigList.size() ){
00076                 return m_contigList[i]->GetSeqLength();
00077         }else if( i == ALL_CONTIGS){
00078                 gnSeqI seqlen = 0;
00079                 for(uint32 j=0; j < m_contigList.size(); j++)
00080                         seqlen += m_contigList[j]->GetSeqLength();
00081                 return seqlen;
00082         }
00083         return GNSEQI_ERROR;
00084 }
00085 gnFileContig* gnFASSource::GetContig( const uint32 i ) const
00086 {
00087         if( i < m_contigList.size() ){
00088                 return m_contigList[i];
00089         }
00090         return 0;
00091 }
00092 
00093 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI ) 
00094 {
00095         m_ifstream.clear();
00096         uint32 contigIndex = contigI;
00097         uint64 startPos = 0;
00098         uint64 readableBytes = 0;
00099         if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00100                 bufLen = 0;
00101                 return false;
00102         }
00103         
00104         if( contigI == ALL_CONTIGS ){
00105                 uint32 curLen = 0;
00106                 uint64 bytesRead = 0;
00107                 while (curLen < bufLen){
00108 //SeqSeek to start, Figure out how much can be read before SeqSeeking again.
00109                         if(readableBytes <= 0)  //Look out for zero length contigs!  IMPLEMENT ME
00110                                 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00111                                         bufLen = curLen;
00112                                         return true;
00113                                 }
00114                         //readLen is the amount to read on this pass
00115                         uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 
00116                         gnSeqC* tmpBuf = new gnSeqC[readLen];   //read into tmpBuf, then filter tmpBuf into curBuf
00117 
00118                         // read chars and filter
00119                         m_ifstream.read(tmpBuf, readLen);
00120                         uint64 gc = m_ifstream.gcount();
00121                         bytesRead += gc;
00122                         readableBytes -= gc;
00123 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00124 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00125                         for(uint32 i=0; i < gc; i++){
00126                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00127                                         buf[curLen] = tmpBuf[i];
00128                                         curLen++;
00129                                 }
00130                         }
00131                         delete[] tmpBuf;
00132                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00133                                 m_ifstream.clear();
00134                                 bufLen = curLen;
00135                                 return true;
00136                         }
00137                 }
00138                 bufLen = curLen;
00139         }
00140         else if( contigI < m_contigList.size() )
00141         {
00142                 uint32 curLen = 0;
00143                 //check to see if the buffer is bigger than the contig.  if so truncate it.
00144                 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00145                 bufLen = bufLen < contigSize ? bufLen : contigSize;
00146                 while (curLen < bufLen)
00147                 {
00148                         uint64 readLen = bufLen - curLen;       //the amount to read on this pass
00149                         gnSeqC* tmpBuf = new gnSeqC[readLen];   //read into tmpBuf, then filter tmpBuf into curBuf
00150 
00151                         // read chars and filter
00152                         m_ifstream.read(tmpBuf, readLen);
00153                         uint64 gc = m_ifstream.gcount();
00154 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00155 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00156                         for(uint32 i=0; i < gc; i++){
00157                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00158                                         buf[curLen] = tmpBuf[i];
00159                                         curLen++;
00160                                 }
00161                         }
00162                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00163                                 m_ifstream.clear();
00164                                 bufLen = curLen;
00165                                 return true;
00166                         }
00167                 }
00168                 bufLen = curLen;
00169         }
00170         return true;
00171 }
00172 
00173 
00174 // private:
00175 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig
00176 // returns startPos, the file offset where the sequence starts
00177 // returns true if successful, false otherwise
00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes )
00179 {
00180         if( contigI == ALL_CONTIGS ){
00181                 // find first contig
00182                 gnSeqI curIndex = 0;
00183                 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00184                 for( ; iter != m_contigList.end(); ++iter )
00185                 {
00186                         uint64 len = (*iter)->GetSeqLength();
00187                         if( (curIndex + len) > start )
00188                                 break;
00189                         curIndex += len;
00190                 }
00191                 if( iter == m_contigList.end() )
00192                         return false;
00193                 // seek to start
00194                 gnSeqI startIndex = start - curIndex;  //startIndex is starting pos. within the contig
00195                 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196         }
00197         else if( contigI < m_contigList.size() )
00198         {
00199                 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00200         }
00201         return false;
00202 }
00203 
00204 //Returns startPos, the file offset where the sequence starts.
00205 //IMPLEMENT ME! utilize HasRepeatSeqGap to skip slow character checking process (data isn't noisy)
00206 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){
00207         readableBytes = 0;
00208         uint32 curLen = 0;
00209         //seek to the file offset where the contig starts
00210         startPos = contig.GetSectStartEnd(gnContigSequence).first;      //set startPos to start where the contig starts
00211 
00212         if(contig.HasRepeatSeqGap())
00213                 if(contig.GetRepeatSeqGapSize().first > 0)
00214                         if(contig.GetRepeatSeqGapSize().second > 0){
00215 //check this    
00216                                 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second;
00217                                 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00218                                 m_ifstream.seekg( startPos, ios::beg );
00219                                 return true;
00220                         }
00221         m_ifstream.seekg( startPos, ios::beg );
00222         if( m_ifstream.eof() ){
00223                 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00224                 return false;
00225         }
00226         while( true ){
00227                   // READ the rest of the contig skipping over invalid characters until we get to the starting base pair.
00228                   // startPos will contain the file offset with the starting base pair
00229                 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00230                 if(tmpbufsize == 0){
00231                         ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n");
00232                         return false;
00233                 }
00234                 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;  //read in the smaller of the two.
00235                 char *tmpbuf = new char[tmpbufsize];
00236                 m_ifstream.read( tmpbuf, tmpbufsize );
00237                 if( m_ifstream.eof() ){
00238                         ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n");
00239                         delete[] tmpbuf;
00240                         return false;
00241                 }
00242                 for( uint32 i=0; i < tmpbufsize; ++i ){
00243                         if( m_pFilter->IsValid(tmpbuf[i]) ){
00244                                 if( curLen >= start ){ //stop when we reach the starting offset within this contig
00245                                         startPos += i;
00246                                         m_ifstream.seekg( startPos, ios::beg );  //seek to startPos
00247                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00248                                         delete[] tmpbuf;
00249                                         return true;
00250                                 }
00251                                 ++curLen;  //each time we read a valid b.p., increment the sequence length
00252                         }
00253                 }
00254                 startPos += tmpbufsize;
00255                 delete[] tmpbuf;
00256         }
00257         return true;
00258 }
00259 
00260 //write out the given source in this file format.
00261 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){
00262         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00263         if(!m_ofstream.is_open())
00264                 return false;
00265         uint32 contigCount = source->GetContigListLength();
00266         for(uint32 contigI = 0; contigI < contigCount; contigI++){
00267                 string contigName = source->GetContigName(contigI);
00268                 m_ofstream << ">" << contigName << ";\n";
00269                 gnSeqI seqLength = source->GetContigSeqLength(contigI);
00270                 while(seqLength > 0){
00271                         gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;            
00272                         gnSeqC *bases = new gnSeqC[writeLen+1];
00273                         boolean success = source->SeqRead(0, bases, writeLen, contigI);
00274                         if(!success)
00275                                 return false;
00276                         bases[writeLen] = '\0';
00277                         m_ofstream << bases << "\n";
00278                         delete[] bases;
00279                         seqLength -= writeLen;
00280                 }
00281         }
00282         m_ofstream.close();
00283         return true;
00284 }
00285 
00286 //write out the given sequence in this file format.
00287 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){
00288         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00289         if(!m_ofstream.is_open())
00290                 Throw_gnEx(FileNotOpened());
00291         Write(seq, m_ofstream, write_coords, enforce_unique_names);
00292         m_ofstream.close();
00293 }
00294 
00295 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords = true, boolean enforce_unique_names = true){
00296         vector<string> contigNameList;
00297         gnSeqC *bases = new gnSeqC[BUFFER_SIZE];
00298         gnGenomeSpec* spec = seq.GetSpec();
00299         
00300         //set the newline type.
00301         string newline = "\r\n";
00302         gnSeqI readOffset = 1;
00303 
00304         for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){
00305                 //write out contig name and header
00306                 string contigName = seq.contigName(fragI);
00307 
00308                 if(enforce_unique_names){
00309                         uint32 name_count = 0;
00310                         for(uint32 i=0; i < contigNameList.size(); i++)
00311                                 if(contigNameList[i] == contigName)
00312                                         name_count++;
00313                         contigNameList.push_back(contigName);
00314                         if(name_count > 0)
00315                                 contigName += "_" + uintToString(name_count);
00316                 }
00317 
00318                 gnFragmentSpec* subSpec = spec->GetSpec(fragI);
00319                 gnBaseHeader *gpbh = subSpec->GetHeader(0);
00320                 string header = "";
00321                 if(gpbh != NULL){
00322                         header = gpbh->GetHeader();
00323                         //delete everything after the first newline.
00324                         uint32 newlinepos = header.find_first_of('\n', 0);
00325                         if(newlinepos != string::npos){
00326                                 if(header[newlinepos-1] == '\r')
00327                                         newlinepos--;
00328                                 header = header.substr(0, newlinepos);
00329                         }
00330                 }       //IMPLEMENT ME!! Does header line need to be < 80 chars?
00331 
00332                 //write out the sequence
00333                 gnSeqI readLength = seq.contigLength(fragI);
00334                 m_ostream << ">" << contigName;
00335                 if(write_coords)
00336                         m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")";
00337                 m_ostream << "; " << header << newline;
00338 
00339                 gnSeqI linePos = 0;
00340                 while(readLength > 0){  //buffer the read/writes
00341                         gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH;
00342                         gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00343                         
00344                         if(!seq.ToArray(bases, writeLen, readOffset))
00345                                 return;
00346                         for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){
00347                                 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH;
00348                                 m_ostream << string(bases + curbase, writeout_size) << newline;
00349                         }
00350                         readLength -= writeLen;
00351                         readOffset += writeLen;
00352                 }
00353                 if(linePos != 0)
00354                         m_ostream << newline;
00355         }
00356 
00357         delete[] bases;
00358 }
00359 
00360 gnGenomeSpec *gnFASSource::GetSpec() const{
00361         gnGenomeSpec *spec = new gnGenomeSpec();
00362         for(uint32 i=0; i < m_contigList.size(); i++){
00363                 //create specs for the fragment and contig levels
00364                 gnFragmentSpec *fragmentSpec = new gnFragmentSpec();
00365                 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i);
00366                 //set up the spec tree structure
00367                 spec->AddSpec(fragmentSpec, i);
00368                 fragmentSpec->AddSpec(contigSpec);
00369 
00370                 fragmentSpec->SetName(m_contigList[i]->GetName());
00371                 fragmentSpec->SetSourceName(m_openString);
00372                 contigSpec->SetName(m_contigList[i]->GetName());
00373                 contigSpec->SetSourceName(m_openString);
00374                 
00375                 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader);
00376                 if(headsect.first != headsect.second){
00377                         gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first);
00378                         fragmentSpec->AddHeader(gpsh, 0);
00379                 }
00380         }
00381         return spec;
00382 }
00383 
00384 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{
00385         if(m_contigList.size() > contigI)
00386                 return m_contigList[contigI];
00387         return NULL;
00388 }
00389 
00390 boolean gnFASSource::ParseStream( istream& fin )
00391 {
00392         // INIT temp varables
00393         uint32 readState = 0;
00394         gnFileContig* currentContig = 0;
00395         string nameFStr;
00396         uint64 seqLength = 0, gapLength = 0;
00397         // INIT buffer
00398         uint64 streamPos = 0;
00399         uint64 bufReadLen = 0;
00400         char* buf = new char[BUFFER_SIZE];
00401         boolean paren_hit = false;
00402         uint32 curNewlineSize = 0;
00403         uint32 repeatSeqSize = 80;
00404 
00405         //decide what type of newlines we have
00406         
00407         fin.getline( buf, BUFFER_SIZE);
00408         fin.seekg(-2, ios::cur);
00409         fin.read( buf, 2);
00410         fin.seekg(0);
00411         if(buf[1] == '\n'){
00412                 if(buf[0] == '\r'){
00413                         m_newlineType = gnNewlineWindows;
00414                         curNewlineSize = 2;
00415                 }else{ 
00416                         curNewlineSize = 1;
00417                         if(buf[1] == '\r')
00418                                 m_newlineType = gnNewlineMac;
00419                         else
00420                                 m_newlineType = gnNewlineUnix;
00421                 }
00422         }
00423 
00424         while( !fin.eof() )
00425         {
00426                   // read chars
00427                 fin.read( buf, BUFFER_SIZE);
00428                 streamPos += bufReadLen;
00429                 bufReadLen = fin.gcount();
00430                 for( uint32 i=0 ; i < bufReadLen ; i++ )
00431                 {
00432                         char ch = buf[i];
00433                         switch( readState )
00434                         {
00435                                 case 0: // CHECK file validity
00436                                         if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) )
00437                                         {
00438                                                 delete[] buf;
00439                                                 return false; // NOT a .fas file
00440                                         }
00441                                         readState = 1;
00442                                 case 1: // BRANCH
00443                                         if( ch == '>' )
00444                                         {
00445                                                 seqLength = 0; gapLength = 0; // reset count
00446                                                 currentContig = new gnFileContig();
00447                                                 currentContig->SetFileStart( streamPos + i );
00448                                                 currentContig->SetRepeatSeqGap(true);
00449                                                 currentContig->SetRepeatSeqSize( repeatSeqSize );
00450                                                 currentContig->SetRepeatGapSize( curNewlineSize );
00451                                                 readState = 2;
00452                                                 paren_hit = false;
00453                                                 nameFStr = "";
00454                                         }
00455                                         else
00456                                                 ++gapLength;
00457                                         break;
00458                                 case 2: // > CONTIG NAME
00459                                         if( isNewLine(ch) || ch == ';')
00460                                         {
00461                                                 currentContig->SetName( nameFStr );
00462                                                 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 );
00463                                                 if( ch == ';' )
00464                                                         readState = 3;
00465                                                 else{
00466                                                         readState = 4;
00467                                                         if(ch == '\r')
00468                                                                 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 );
00469                                                 }
00470                                         }
00471                                         else if( ch == '(' ){
00472                                                 if(isSpace(buf[i-1])){
00473                                                         //delete the last char in nameFStr
00474                                                         nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00475                                                 }
00476                                                 paren_hit = true;
00477                                         }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit )
00478                                                 nameFStr += ch;
00479                                         break;
00480                                 case 3: // >... ; REMARK
00481                                         if( isNewLine(ch) )
00482                                                 readState = 4;
00483                                         break;
00484                                 case 4: // >... ; /n NEWLINE BRANCH
00485                                         if( ch == '>' )
00486                                         {
00487                                                 readState = 3;
00488                                         }
00489                                         else if( m_pFilter->IsValid(ch) )
00490                                         {
00491                                                 currentContig->SetSectEnd( gnContigHeader, streamPos + i );
00492                                                 currentContig->SetSectStart( gnContigSequence, streamPos + i);
00493                                                 seqLength = 1; gapLength = 0;
00494                                                 readState = 5;
00495                                         }
00496                                         break;
00497                                 case 5: // SEQUENCE
00498                                         while(i < bufReadLen){
00499                                                 ch = buf[i];
00500                                                 if( m_pFilter->IsValid(ch) ){
00501                                                         if(gapLength > 0){
00502                                                                 if(seqLength != repeatSeqSize)
00503                                                                         currentContig->SetRepeatSeqGap(false);
00504                                                                 if(gapLength != curNewlineSize)
00505                                                                         //file is corrupt, can't do jumps.
00506                                                                         currentContig->SetRepeatSeqGap(false);
00507                                                                 currentContig->AddToSeqLength( seqLength );
00508                                                                 seqLength = 0;
00509                                                                 gapLength = 0;
00510                                                         }
00511                                                         seqLength++;
00512                                                 }else if( ch == '>'){
00513                                                         currentContig->AddToSeqLength( seqLength );
00514                                                         currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1);
00515                                                         currentContig->SetFileEnd( streamPos + i - 1 );
00516                                                         m_contigList.push_back(currentContig);
00517                                                         readState = 1;
00518                                                         i--;
00519                                                         break;
00520                                                 }
00521                                                 else if( isNewLine(ch) ){
00522                                                         gapLength++;
00523                                                 }
00524                                                 else{
00525                                                         // Error cannot do nice jumps
00526                                                         currentContig->SetRepeatSeqGap(false);
00527                                                 }
00528                                                 i++;
00529                                         }
00530                                         break;
00531                                 default:
00532                                         ErrorMsg("ERROR");
00533                                         delete[] buf;
00534                                         return false;
00535                                         break;
00536                         }
00537                 }// for all buf
00538         }// while !eof
00539         // CLEAN UP
00540         if( currentContig != 0 )
00541         {
00542                 if( readState == 2 )
00543                 {
00544                         currentContig->SetName( nameFStr );
00545                 }
00546                 if( (readState >= 2) && (readState < 5) )
00547                 {
00548                         currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen );
00549                 }
00550                 else if( readState ==  5 )
00551                 {
00552                         currentContig->AddToSeqLength( seqLength );
00553                         currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen );
00554                 }                       
00555                 currentContig->SetFileEnd( streamPos + bufReadLen );
00556                 m_contigList.push_back(currentContig);
00557         }
00558         m_ifstream.clear();
00559         delete[] buf;
00560         return true;
00561 }

Generated at Fri Nov 30 15:36:51 2001 for libGenome by doxygen1.2.8.1 written by Dimitri van Heesch, © 1997-2001