Class Bio::FlatFile::AutoDetect
In: lib/bio/io/flatfile.rb  (CVS)
Parent: Object

AutoDetect automatically determines the database class of the given data.

Methods

Included Modules

TSort

Classes and Modules

Class Bio::FlatFile::AutoDetect::RuleDebug
Class Bio::FlatFile::AutoDetect::RuleProc
Class Bio::FlatFile::AutoDetect::RuleRegexp
Class Bio::FlatFile::AutoDetect::RuleRegexp2
Class Bio::FlatFile::AutoDetect::RuleSpecial
Class Bio::FlatFile::AutoDetect::RuleTemplate
Class Bio::FlatFile::AutoDetect::RulesArray

Constants

TopRule = RuleSpecial.new('top')   Special element that is always top priority.
BottomRule = RuleSpecial.new('bottom')   Special element that is always bottom priority.

Public Class methods

Makes a new autodetect object and adds each of the given elements to it.

[Source]

# File lib/bio/io/flatfile.rb, line 1119
      # Makes a new autodetect object and registers the given elements on it.
      #
      # *Arguments*:
      # * _arg_: zero or more elements; each one is handed to #add in order
      # *Returns*:: the newly created autodetect object
      def self.[](*arg)
        arg.each_with_object(self.new) { |rule, detector| detector.add(rule) }
      end

returns the default autodetect object

[Source]

# File lib/bio/io/flatfile.rb, line 1106
      # Returns the default autodetect object.
      #
      # The object is built with make_default the first time it is needed
      # (or whenever the stored value is nil/false) and then reused.
      def self.default
        @default ||= self.make_default
      end

sets the default autodetect object.

[Source]

# File lib/bio/io/flatfile.rb, line 1114
      class << self
        # Sets the default autodetect object (stored in the class-level
        # instance variable @default).
        attr_writer :default
      end

Makes a new autodetect object populated with the default set of rules.

[Source]

# File lib/bio/io/flatfile.rb, line 1126
      # Builds a fresh autodetect object preloaded with the built-in rules.
      #
      # Every rule is created inline inside the <tt>self[...]</tt> call and is
      # also assigned to a local variable, so that priority relations between
      # rules can be declared with +is_prior_to+ once all of them are
      # registered. Three kinds of rule constructors are used here:
      # * <tt>RuleRegexp[name, regexp]</tt>: a class name and one pattern
      # * <tt>RuleRegexp2[name, regexp, regexp]</tt>: a class name and two
      #   patterns
      # * <tt>RuleProc.new(*names) { |text| ... }</tt>: class names and a
      #   block that returns a class, false, or nil
      #
      # NOTE(review): how each rule class turns these arguments into a guess
      # is defined outside this method -- confirm there before relying on it.
      #
      # *Returns*:: the new autodetect object
      def self.make_default
        a = self[
          genbank  = RuleRegexp[ 'Bio::GenBank',
            /^LOCUS       .+ bp .*[a-z]*[DR]?NA/ ],
          genpept  = RuleRegexp[ 'Bio::GenPept',
            /^LOCUS       .+ aa .+/ ],
          medline  = RuleRegexp[ 'Bio::MEDLINE',
            /^PMID\- [0-9]+$/ ],
          embl     = RuleRegexp[ 'Bio::EMBL',
            /^ID   .+\; .*(DNA|RNA|XXX)\;/ ],
          sptr     = RuleRegexp2[ 'Bio::SPTR',
            /^ID   .+\; *PRT\;/,
            /^ID   [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
          prosite  = RuleRegexp[ 'Bio::PROSITE',
            /^ID   [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
          transfac = RuleRegexp[ 'Bio::TRANSFAC',
            /^AC  [-A-Za-z0-9_\.]+$/ ],

          # One rule covers both AAindex classes: once an "H " line is seen,
          # the block picks AAindex2 or AAindex1 from a later line.
          aaindex  = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
            if /^H [-A-Z0-9_\.]+$/ =~ text then
              if text =~ /^M [rc]/ then
                Bio::AAindex2
              elsif text =~ /^I    A\/L/ then
                Bio::AAindex1
              else
                false #fail to determine
              end
            else
              nil
            end
          end,

          litdb    = RuleRegexp[ 'Bio::LITDB',
            /^CODE        [0-9]+$/ ],
          brite    = RuleRegexp[ 'Bio::KEGG::BRITE',
            /^Entry           [A-Z0-9]+/ ],
          orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
            /^ENTRY       .+ KO\s*/ ],
          drug     = RuleRegexp[ 'Bio::KEGG::DRUG',
            /^ENTRY       .+ Drug\s*/ ],
          glycan   = RuleRegexp[ 'Bio::KEGG::GLYCAN',
            /^ENTRY       .+ Glycan\s*/ ],
          enzyme   = RuleRegexp2[ 'Bio::KEGG::ENZYME',
            /^ENTRY       EC [0-9\.]+$/,
            /^ENTRY       .+ Enzyme\s*/
          ],
          compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
            /^ENTRY       C[A-Za-z0-9\._]+$/,
            /^ENTRY       .+ Compound\s*/
          ],
          reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
            /^ENTRY       R[A-Za-z0-9\._]+$/,
            /^ENTRY       .+ Reaction\s*/
          ],
          genes    = RuleRegexp[ 'Bio::KEGG::GENES',
            /^ENTRY       .+ (CDS|gene|.*RNA|Contig) / ],
          genome   = RuleRegexp[ 'Bio::KEGG::GENOME',
            /^ENTRY       [a-z]+$/ ],

          # One rule covers both MaXML classes; $1 below is the
          # (sequences|clusters) capture from the DOCTYPE match.
          fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                                'Bio::FANTOM::MaXML::Sequence') do |text|
            if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
              case $1
              when 'clusters'
                Bio::FANTOM::MaXML::Cluster
              when 'sequences'
                Bio::FANTOM::MaXML::Sequence
              else
                nil #unknown
              end
            else
              nil
            end
          end,

          pdb = RuleRegexp[ 'Bio::PDB',
            /^HEADER    .{40}\d\d\-[A-Z]{3}\-\d\d   [0-9A-Z]{4}/ ],
          het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
            /^RESIDUE +.+ +\d+\s*$/ ],

          clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
          /^CLUSTAL .*\(.*\).*sequence +alignment/,
          /^CLUSTAL FORMAT for T-COFFEE/ ],

          gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
          /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

          gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
          /^!!(N|A)A_SEQUENCE .+/ ],

          blastxml = RuleRegexp[ 'Bio::Blast::Report',
            /\<\!DOCTYPE BlastOutput PUBLIC / ],
          # The generic BLAST/TBLAST patterns further down also match WU-BLAST
          # headers ("[\-\.\w]+" accepts the "-WashU" suffix), which is why
          # the WU rules are given priority over them in the BLAST section of
          # the dependencies below.
          wublast  = RuleRegexp[ 'Bio::Blast::WU::Report',
            /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
          wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
            /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
          blast    = RuleRegexp[ 'Bio::Blast::Default::Report',
            /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
          tblast   = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
            /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

          blat   = RuleRegexp[ 'Bio::Blat::Report',
            /^psLayout version \d+/ ],
          spidey = RuleRegexp[ 'Bio::Spidey::Report',
            /^\-\-SPIDEY version .+\-\-$/ ],
          hmmer  = RuleRegexp[ 'Bio::HMMER::Report',
            /^HMMER +\d+\./ ],
          sim4   = RuleRegexp[ 'Bio::Sim4::Report',
            /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

          # One rule covers the three ">"-headed formats; the block returns
          # false when a ">" line is present but no format can be chosen,
          # and nil when there is no ">" line at all.
          fastaformat = RuleProc.new('Bio::FastaFormat',
                                     'Bio::NBRF',
                                     'Bio::FastaNumericFormat') do |text|
            if /^>.+$/ =~ text
              case text
              when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
                Bio::NBRF
              when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
                  Bio::FastaFormat
              when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
                Bio::FastaNumericFormat
              else
                false
              end
            else
              nil
            end
          end
        ]

        # dependencies
        # (x.is_prior_to y presumably means x is tried before y; is_prior_to
        # is defined on the rule classes, outside this method -- confirm)
        # NCBI
        genbank.is_prior_to genpept
        # EMBL/UniProt
        embl.is_prior_to sptr
        sptr.is_prior_to prosite
        prosite.is_prior_to transfac
        # KEGG
        #aaindex.is_prior_to litdb
        #litdb.is_prior_to brite
        brite.is_prior_to orthology
        orthology.is_prior_to drug
        drug.is_prior_to glycan
        glycan.is_prior_to enzyme
        enzyme.is_prior_to compound
        compound.is_prior_to reaction
        reaction.is_prior_to genes
        genes.is_prior_to genome
        # PDB
        pdb.is_prior_to het
        # BLAST
        wublast.is_prior_to wutblast
        wutblast.is_prior_to blast
        blast.is_prior_to tblast
        # FastaFormat
        # (placed after BottomRule, i.e. presumably after every ordinary rule)
        BottomRule.is_prior_to(fastaformat)

        # for debug
        #debug_first = RuleDebug.new('debug_first')
        #a.add(debug_first)
        #debug_first.is_prior_to(TopRule)

        ## for debug
        #debug_last = RuleDebug.new('debug_last')
        #a.add(debug_last)
        #BottomRule.is_prior_to(debug_last)
        #fastaformat.is_prior_to(debug_last)

        # The priorities above were declared after the rules were added, so
        # rehash the object before handing it out.
        a.rehash
        return a
      end

Creates a new Autodetect object

[Source]

# File lib/bio/io/flatfile.rb, line 984
      # Creates a new autodetect object that contains only the two special
      # elements, TopRule and BottomRule.
      def initialize
        # autodetection rules, keyed by element name
        @rules = {}
        # cached ordering of the elements; nil until it is computed
        @elements = nil
        [TopRule, BottomRule].each { |special| add(special) }
      end

Public Instance methods

Adds a new element. Returns elem.

[Source]

# File lib/bio/io/flatfile.rb, line 995
      # Adds a new element and invalidates the cached element ordering.
      #
      # *Arguments*:
      # * _elem_: the element to add; it is stored under <tt>elem.name</tt>
      # *Returns*:: _elem_
      # *Raises*:: RuntimeError if an element with the same name was
      #            already added
      def add(elem)
        key = elem.name
        if @rules[key]
          raise 'element name conflicts'
        end
        @elements = nil
        @rules[key] = elem
        elem
      end

Autodetects the database class from the given text. Returns the database class on success, or nil on failure.

[Source]

# File lib/bio/io/flatfile.rb, line 1063
      # Autodetects the database class from the text.
      #
      # Each element is asked, in the order given by #elements, to guess
      # from _text_ and _meta_; the first non-nil, non-false guess wins.
      #
      # *Arguments*:
      # * _text_: the text to examine
      # * _meta_: hash of additional information, passed through to every
      #   element's +guess+ unchanged
      # *Returns*:: the first truthy guess, or nil when no element
      #             produces one
      def autodetect(text, meta = {})
        elements.each do |rule|
          guessed = rule.guess(text, meta)
          return guessed if guessed
        end
        # Always report failure as nil. Without this, a false returned by
        # the last element would leak out as the result.
        nil
      end

Autodetects the database class from the FlatFile object. Returns the database class on success, or nil on failure.

[Source]

# File lib/bio/io/flatfile.rb, line 1076
      # Autodetects the database class from a FlatFile object.
      #
      # *Arguments*:
      # * _ff_: the object whose @stream instance variable holds the stream
      #   to examine
      # * _lines_: maximum number of lines to prefetch while trying
      #   (default 31)
      # *Returns*:: the detected class, or nil if nothing was detected
      def autodetect_flatfile(ff, lines = 31)
        meta = {}
        stream = ff.instance_variable_get(:@stream)
        # Streams that have no usable #path simply provide no path hint.
        path = begin
          stream.path
        rescue NameError
          nil
        end
        if path
          meta[:path] = path
          # Call autodetect once with the path hint, before prefetching
          # any further line from the stream.
          found = autodetect(stream.prefetch_buffer, meta)
          return found if found
        end
        # Prefetch line by line, retrying after every non-blank line.
        1.upto(lines) do
          line = stream.prefetch_gets
          break unless line
          next if line.strip.empty?
          found = autodetect(stream.prefetch_buffer, meta)
          return found if found
        end
        nil
      end

Iterates over each element.

[Source]

# File lib/bio/io/flatfile.rb, line 1056
      # Iterates over each element, in the order given by #elements.
      #
      # The given block is handed straight to Array#each on that ordering.
      def each_rule(&block) #:yields: elem
        elements.each(&block)
      end

Returns current elements as an array whose order fulfills all elements’ priorities.

[Source]

# File lib/bio/io/flatfile.rb, line 1033
      # Returns current elements as an array whose order fulfills all
      # elements' priorities.
      #
      # The array is the reversed result of +tsort+. It is cached in
      # @elements and reused until that cache is reset to nil.
      def elements
        @elements ||= tsort.reverse!
      end

visualizes the object (mainly for debug)

[Source]

# File lib/bio/io/flatfile.rb, line 1049
      # Visualizes the object (mainly for debug).
      #
      # *Returns*:: a string holding the class name followed by the
      #             inspected name of every element, in #elements order
      def inspect
        names = elements.map { |rule| rule.name.inspect }.join(' ')
        "<#{self.class} #{names}>"
      end

rebuilds the object and clears internal cache.

[Source]

# File lib/bio/io/flatfile.rb, line 1043
      # Rebuilds the rule table and clears the cached element ordering, so
      # that the next call to #elements recomputes it.
      #
      # *Returns*:: nil (the value of the final assignment)
      def rehash
        # Hash#rehash re-indexes the table from the keys' current hash values
        @rules.rehash
        # drop the cached ordering
        @elements = nil
      end

(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.

[Source]

# File lib/bio/io/flatfile.rb, line 1011
      # (required by TSort.)
      # For a given element, yields each child (= lower priority element)
      # of the element.
      #
      # * TopRule: every registered element is a child, except TopRule
      #   itself and elements that list TopRule among their lower priority
      #   elements.
      # * BottomRule: only elements that list BottomRule among their higher
      #   priority elements are children.
      # * Any other element: its own lower priority elements (BottomRule
      #   excluded), followed by BottomRule unless the element lists
      #   BottomRule among its higher priority elements.
      def tsort_each_child(elem)
        if elem == TopRule
          @rules.each_value do |rule|
            next if rule == TopRule
            next if rule.lower_priority_elements.include?(TopRule)
            yield rule
          end
        elsif elem == BottomRule
          @rules.each_value do |rule|
            yield rule if rule.higher_priority_elements.include?(BottomRule)
          end
        else
          elem.lower_priority_elements.each do |lower|
            yield lower if lower != BottomRule
          end
          yield BottomRule unless elem.higher_priority_elements.include?(BottomRule)
        end
      end

(required by TSort.) For all elements, yields each element.

[Source]

# File lib/bio/io/flatfile.rb, line 1004
      # (required by TSort.)
      # Yields every registered element, i.e. each value of the rule table.
      def tsort_each_node(&block)
        @rules.each_value(&block)
      end

[Validate]