#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2008 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Primitive.pm,v 1.1 2002/07/30 17:44:27 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::Primitive;

use strict;
use base qw(Exporter);
use autouse 'Algorithm::Numerical::Shuffle'=>qw(shuffle);
use SelfLoader;

use SubOpt;
use G::Messenger;

our @EXPORT = qw(
	     complement
	     translate
	     splitprintseq
	     to_fasta
	     shuffleseq
	     DoubleHelix
);



=head1 NAME

  G::Seq::Primitivew - Basic sequence analysis methods

=head1 DESCRIPTION

           This class is a part of G-language Genome Analysis Environment,
           collecting basic sequence analysis methods.

=cut



#::::::::::::::::::::::::::::::
#          Constants
#::::::::::::::::::::::::::::::

my %CodonTable = (
	       'gac', 'D', 'caa', 'Q', 'gca', 'A', 'ctg', 'L',
	       'gat', 'D', 'cag', 'Q', 'gcc', 'A', 'ctt', 'L',
	       'gaa', 'E', 'agc', 'S', 'gcg', 'A', 'ata', 'I',
	       'gag', 'E', 'agt', 'S', 'gct', 'A', 'atc', 'I',
	       'aga', 'R', 'tca', 'S', 'gga', 'G', 'att', 'I',
	       'agg', 'R', 'tcc', 'S', 'ggc', 'G', 'cca', 'P',
	       'cga', 'R', 'tcg', 'S', 'ggg', 'G', 'ccc', 'P',
	       'cgc', 'R', 'tct', 'S', 'ggt', 'G', 'ccg', 'P',
	       'cgg', 'R', 'aca', 'T', 'gta', 'V', 'cct', 'P',
	       'cgt', 'R', 'acc', 'T', 'gtc', 'V', 'atg', 'M',
	       'aaa', 'K', 'acg', 'T', 'gtg', 'V', 'tgg', 'W',
	       'aag', 'K', 'act', 'T', 'gtt', 'V', 'tgc', 'C',
	       'cac', 'H', 'tac', 'Y', 'tta', 'L', 'tgt', 'C',
	       'cat', 'H', 'tat', 'Y', 'ttg', 'L', 'taa', '/',
	       'aac', 'N', 'ttc', 'F', 'cta', 'L', 'tag', '/',
	       'aat', 'N', 'ttt', 'F', 'ctc', 'L', 'tga', '/'
		  );


#::::::::::::::::::::::::::::::
#    Let the code begin...
#::::::::::::::::::::::::::::::


sub translate {
    my $seq = lc(shift);
    my $amino = '';

    while(3 <= length($seq)){
	my $codon = substr($seq, 0, 3);
	substr($seq, 0, 3) = '';
	if ($codon =~ /[^atgc]/){
	    $amino .= 'X';
	}else{
	    $amino .= $CodonTable{$codon};
	}
    }

    msg_error("Translation: illegal length.\n") if(length($seq));

    return $amino;
}


__DATA__

sub complement {
    my $nuc = reverse(shift);

    $nuc =~ tr
	[acgturymkdhbvwsnACGTURYMKDHBVWSN]
	[tgcaayrkmhdvbwsnTGCAAYRKMHDVBWSN];

    return $nuc;
}


=head2 splitprintseq

  Name: splitprintseq   -   format sequence data for printing

  Description:
    This method splits the given sequence string in segments of 
    certain length and inserts newline code ("\n") to print the
    sequence in a formatted way. By default, this function splits
    the string in segments of 60 letters. This length can be
    changed by supplying the second argument.

    
  Usage : 
    print splitprintseq($seq); # print sequence in a formatted way.
      or
    $formatted_seq = splitprintseq($seq, 100);  
    # split into segments of 100 characters and add "\n" to each line.

 Options:
    Length of segments can be specified as the second argument.
    Default is 60 characters to match GenBank.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20050116-01 initial posting

=cut


sub splitprintseq {
    my $seq = shift;
    my $len = shift || 60;
    my $ret = '';

    while(length $seq){
	$ret .= substr($seq, 0, $len) . "\n";
	substr($seq, 0, $len) = '';
    }
    
    return $ret;
}


=head2 to_fasta

  Name: to_fasta   -   output given sequence to a FASTA file

  Description:
    This method outputs the given sequence as a FASTA file.
    
  Usage : 
    to_fasta($seq, -name=>"My sequence"); # output the sequence to out.fasta
      or
    $fasta_string = to_fasta($gb, -output=>"return");

 Options:
    -name        string for FASTA header (default: sequence)
                 if the first argument is an instance of new G(), 
                 $gb->{LOCUS}->{id} is used as default
    -length      number of characters per line (default: 60)
    -filename    output filename (default: "out.fasta")
    -output      "f" to output to file, and return the filename.
                 "return" to return the fasta as a string

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070612-01 added -output option
    20050116-01 initial posting

=cut

sub to_fasta{
    SubOpt::opt_default(length=>60, filename=>"out.fasta", name=>"sequence", output=>"f");
    my @args = SubOpt::opt_get(@_);
    my $gb = SubOpt::opt_as_gb(shift @args);
    my $filename = SubOpt::opt_val("filename");
    my $name = SubOpt::opt_val("name");
    my $length = SubOpt::opt_val("length");
    my $output = SubOpt::opt_val("output");

    $name = $gb->{LOCUS}->{id} if($name eq 'sequence' && length $gb->{LOCUS}->{id});

    if($output eq 'f'){
	open(OUT, ">$filename") || die($!);
	printf OUT ">%s\n%s", $name, splitprintseq($gb->{SEQ}, $length);
	close(OUT);
	return $filename;
    }else{
	return sprintf ">%s\n%s", $name, splitprintseq($gb->{SEQ}, $length);
    }
}




=head2 shuffleseq

  Name: shuffleseq   -   create randomized sequence with conserved composition

  Description:
    Shuffle and randomize the given sequence, conserving the nucleotide/peptide
    content of the original sequence. Fisher-Yates Algorithm is used for shuffling.
    
  Usage:
    $shuffled_seq = shuffleseq($gb);

  Options:
    None.

  References:
    1. Fisher R.A. and Yates F. (1938) "Example 12", Statistical Tables, London
    2. Durstenfeld R. (1964) "Algorithm 235: Random permutation", CACM 7(7):420
  
  Author:
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070612-01 initial posting

=cut


sub shuffleseq{
    my @args = SubOpt::opt_get(@_);
    my $gb = SubOpt::opt_as_gb(shift @args);

    return join('', shuffle(split(//, $gb->{SEQ})));
}




=head2 DoubleHelix

  Name: DoubleHelix   -   we all love the DNA molecule!

  Description:
    This method prints the given sequence as an ASCII text-based graphic
    depicting a DNA molecule. Enjoy :-)
    
  Usage: 
    DoubleHelix($seq);

  Options:
    -speed      controls the time to display one base-pair.
                default is 0.05 (second)

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20060511-01 initial posting

=cut

sub DoubleHelix{
    SubOpt::opt_default(speed=>0.05);
    my @args = SubOpt::opt_get(@_);  
    my $gb = SubOpt::opt_as_gb(shift @args);
    my $speed = SubOpt::opt_val("speed");

    $| = 1;

    my $i;
    my (@offset) = qw/1 0 0 0 1 2 3 4 5 5 4 3 2 1 0 0 0 1/;
    my (@dist)   = qw/0 2 3 4 4 4 3 2 0 0 2 3 4 4 4 3 2 0/;

    foreach my $base (split(//, $gb->{SEQ})){
        print "         ", q/ /x$offset[($i%scalar(@offset))];
	print uc($base);
	$i ++;
	print q/-/x$dist[($i%scalar(@dist))];
	print uc(complement($base)), "\n";
	select(undef,undef,undef,$speed);
    }
}




1;



