#!/bin/bash

#
# aop.sh -- Shell wrapper for the POA (version 2) multiple sequence
#   alignment program (see http://www.bioinformatics.ucla.edu/poa for
#   information on POA)
#
# (C) 2007 by
#     Markus Goeker (support [AT] goeker [DOT] org)
#
# This program is distributed under the terms of the Gnu Public License V2.
# For further information, see http://www.gnu.org/licenses/gpl.html
#
# If you happen to use this script in a publication, please cite the web
# page at http://www.goeker.org/mg/aop/
#

########################################################################

# Change this to other default locations:
declare -r POADEF="/usr/local/bin/poa"
declare -r FORMATDB="/usr/bin/formatdb"
declare -r BLASTALL="/usr/bin/blastall"

########################################################################

# Chat Ramey's recommendations for secure shell scripts
IFS=$' \t\n'
unset -f unalias
\unalias -a
unset -f command
SYSPATH="$(command -p getconf PATH 2>/dev/null)"
if [[ -z "$SYSPATH" ]] ; then
  SYSPATH="/usr/bin:/bin"
fi
PATH="$SYSPATH:$PATH"

########################################################################

# Check whether file has legible contents or is executable
function file_ok
{
  local checkexec=
  if [ "$1" = "-x" ]; then
    checkexec=yes
    shift
  fi
  if [ -z $checkexec ]; then
    for i; do
      if ! [ -r "$i" ] || ! [ -f "$i" ] || ! [ -s "$i" ]; then
        echo "Can't read contents of file \"$i\"." >&2
        return 1
      fi
    done
  else
    for i; do
      if ! [ -x "$i" ] || ! [ -f "$i" ]; then
        echo "Can't execute file \"$i\"." >&2
        return 1
      fi
    done
  fi
}

########################################################################

# Remove directory name and file extension from a file name
function trunk_name
{
  if [ $# -ne 1 ]; then
    echo "Wrong number of arguments for function $FUNCNAME." >&2
    exit 1
  fi
  local outname="${1##*/}"
  echo "${outname%.*}"
}

########################################################################

set -eu

adblast=
clean=
convert="-tolower" # Default is nucleotide sequences
delete=
doblast=
filterfile=
helpmsg=
isgbk=
keeptmp=
matrix=
msafile=
poa="$POADEF"
printmat=
scorefile=
taxonomy=
declare -i woco=0

while getopts "a:bBcdf:ghkl:m:pqs:w:x" opt; do
  case $opt in
    a ) msafile="$OPTARG";;
    b ) doblast=yes;;
    B )
        doblast=yes
        adblast=yes
    ;;
    c ) clean=yes;;
    d ) delete=yes;;
    f ) filterfile="$OPTARG";;
    g ) isgbk=yes;;
    h ) helpmsg=yes;;
    k ) keeptmp=yes;;
    l ) poa="$OPTARG";;
    m )
        matrix="$OPTARG"
        file_ok "$matrix" || exit 1
    ;;
    p ) convert="-toupper";;
    q ) printmat=yes;;
    s )
        scorefile="$OPTARG"
        file_ok "$scorefile" || exit 1
    ;;
    w )
        woco="$OPTARG"
        if [ $woco -lt 2 ]; then
          echo "Invalid -w argument \"$OPTARG\"." >&2
          exit 1
        fi
    ;;
    x ) taxonomy=yes;;
    * ) exit 1;;
  esac
done

shift $(($OPTIND-1))

########################################################################

if [ "$clean" -a "$isgbk" ] ||
  [ $woco -gt 0 -a "$taxonomy" ] ||
  [ $woco -gt 0  -a "$doblast" ] ||
  [ "$taxonomy" -a "$doblast" ] ||
  [ "$taxonomy" -a -z "$isgbk" ] ||
  [ "$keeptmp" -a -z "$clean" -a -z "$isgbk" ] ||
  [ "$keeptmp" -a "$delete" ] ||
  [ "$filterfile" -a -z "$msafile" -a -z "$isgbk" ] ||
  [ "$printmat" -a "$clean" ] ||
  [ "$printmat" -a "$isgbk" ] ||
  [ "$printmat" -a $woco -gt 0 ]; then
  echo "Incompatible options entered." >&2
  exit 1
fi

if [ $helpmsg ] || [ $# -ne 1 ] && [ -z "$filterfile" ] && [ -z $printmat ]; then
  cat >&2 <<-__EOF

	${0##*/} is a Shell wrapper for the POA alignment program.

	Usage: ${0##*/} [options] FASTA_file

	Default location for the POA executable is "$POADEF". Further
	default behaviour is to create an intermediary BLOSUM matrix file.
	To use the -b or -B option, the NCBI blast tools blastall and
	formatdb must be installed.

	Available options:
	  
	  -a file    Read profile alignment from file. See also -f.
	  -b         Compute pair-wise similarity scores with BLAST and
	             emulate make_pscores.pl as distributed with POA.
	  -B         Compute pair-wise similarity scores with BLAST but
	             use a more elaborate similarity function.
	  -c         Pre-clean the FASTA file to improve POA output; use
	             this if input files contains Macintosh linebreaks
	             or sequence gaps. See also -k.
	  -d         Output only aligned sequences in FASTA format.
	  -f file    Read sequence names (one per line) from file and
	             remove these sequences from -a or -g input. In the
	             latter case, acessions numbers are expected.
	  -g         Convert Genbank to FASTA before running POA.
	  -h         Print this help message.
	  -k         Keep FASTA file created by -c or -g.
	  -l file    Use alternative POA executable in file.
	  -m file    Use alternative substitution matrix in file.
	  -p         Force treatment as amino acid sequences.
	  -q         Print default BLOSUM matrix file to standard output.
	  -s file    Read pair-wise similarity scores from file.
	  -w integer Provide word length to compute pair-wise similarity
	             scores using the Bray-Curtis function and word counts.
	  -x         Extract taxonomical similarity scores from Genbank
	             file. To be used in connection with -g.

	This script was written by Markus Goeker.
	Contact: <support@goeker.org>

__EOF
  exit 1
fi

########################################################################

[ $# -ne 0 ] && infile="$1" || infile=

for i in "$infile" "$msafile" "$filterfile"; do
  [ "$i" ] && ! file_ok "$i" && exit 1
done

file_ok -x "$poa" || exit 1

########################################################################

if [ "$filterfile" ] && [ -z $isgbk ]; then # Filter for sequences in MSA file
  tmpfilter="$(trunk_name "$0")_filter_$$.tmp"
  if ! awk 'NF {print "SOURCENAME=" $1}' "$filterfile" > "$tmpfilter" 2> /dev/null; then
    echo "Parsing $filterfile did not result." >&2
    echo "Examine format of $filterfile and/or your awk implementation." >&2
    rm -f "$tmpfilter"
    exit 1
  fi
else
  tmpfilter=
fi 

########################################################################

if [ $clean ]; then # Clean FASTA input
  origfile="$infile" # Exchange names
  infile="$(trunk_name "$infile").cfa"
  echo "Cleaning FASTA input..."
  echo
  if ! awk -v RS="\n|\r\n|\r" '
      /^>/ {
        print ">seq" (++count) " " substr($0,2)
       }
      ! /^>/ {
        gsub(/[^A-Za-z]+/,"")
        print
      }
    ' "$origfile" > "$infile" 2> /dev/null
  then
    echo "Cleaning $origfile did not result." >&2
    echo "Examine format of $origfile and/or your awk implementation." >&2
    exit 1
  fi
fi

########################################################################

if [ $isgbk ]; then # Genbank flatfile has to be converted
  awkscript='
    BEGIN {
      OFS="\t"
      if (ARGC!=2)
        exit 1
      # Read accessions of sequences to be omitted
      if (filterfile) {
        print "Reading file " filterfile "..." > "/dev/stderr"
        while ((getline<filterfile)>0) {
          accession=$1
          gsub(/[^A-Za-z0-9]+/,"_",accession)
          haveacc[accession]++
        }
      }
    }
    # Read accession number
    /^ *ACCESSION +/ {
      accession=$2
      gsub(/[^A-Za-z0-9]+/,"_",accession)
      if (! (accession in haveacc)) {
        musthave=1
        haveacc[accession]++
      } else
        print "Omitting sequence " accession "." > "/dev/stderr"
    }
    # Read taxonomical data
    /^ *ORGANISM +[^ ]+/ && musthave {
      termtax=substr($0,index($0,$2))
      if (simfile) {
        while (getline && $0!~/^ *(REFERENCE|FEATURES) +/)
          taxonomy=(taxonomy $0)
        taxonomy=(taxonomy ";" termtax)
        gsub(/^;+|;+$/,"",taxonomy)
        gsub(/;+/,";",taxonomy)
        np=split(taxonomy,part,";")
        terminal[++taxcount]=accession
        for (i=1;i<=np;i++)
          parents[taxcount,i]=part[i]
        for (i=1;i<taxcount;i++)
          print terminal[i], accession,
            taxsim(parents,i,taxcount) > simfile
        taxonomy=""
      }
      gsub(/[ \t\v\r\n\b\a\f]+/,"_",termtax)
      print ">" accession " " termtax "_" accession
      termtax=accession=""
    }
    # Read and print sequences
    /^ *ORIGIN/,/^ *\/\// {
      if ($0~/^ +[0-9]+ +/ && musthave) {
        gsub(/[^A-Za-z]+/,"")
        print
      } else if ($0~/^ *\/\//)
        musthave=0
    }
    # Compute taxonomical similarity
    function taxsim(taxarr,num1,num2,  i,sim) {
      for (i=1;(num1,i) in taxarr;i++) {
        if (taxarr[num1,i]!=taxarr[num2,i] || ! ((num2,i) in taxarr))
          break
        sim++
      }
      return sim
    }
  '
  gbkfile="$infile" # Change names
  infile="$(trunk_name "$gbkfile").fas"
  echo "Converting Genbank file..."
  echo
  if [ "$taxonomy" ]; then
    scorefile="$(trunk_name "$gbkfile")_$(trunk_name "$0").sim"
    echo "Extracting taxonomic information..."
    echo
  fi
  if ! awk -v simfile="$scorefile" -v filterfile="$filterfile" "$awkscript" \
    "$gbkfile" > "$infile"
  then
    echo "Conversion of Genbank format did not result." >&2
    echo "Examine input file or your awk implementation." >&2
    exit 1
  fi
fi

########################################################################

if [ $doblast ]; then # Compute similarity scores with BLAST
  file_ok -x "$FORMATDB" "$BLASTALL" || exit 1
  scorefile="$(trunk_name "$infile")_$(trunk_name "$0").sim"
  blasttmpname="$(trunk_name "$0")_blast_$$"
  if [ "$convert" = "-tolower" ]; then # Nucleotides
    blastparam1=F
    blastparam2="-p blastn"
  else # Amino acids
    blastparam1=T
    blastparam2="-p blastp -M BLOSUM80"
  fi
  echo "Computing pair-wise BLAST similarity scores..."
  echo
  if "$FORMATDB" -i "$infile" -p $blastparam1 -n "$blasttmpname" -l /dev/null; then
    if ! "$BLASTALL" $blastparam2 -d "$blasttmpname" -i "$infile" |
      awk -v adblast="$adblast" '
        BEGIN { OFS="\t" }
        $1~/^Query=/ { seq1=$2 }
        /\(bits\)/,/^>/ {
          if (NF==3)
            if (adblast) {
              name[$1]++
              sim[seq1,$1]=$2
            } else
              print seq1, $1, $2 ".0"
        }
        END { if (adblast) {
          for (i in name) {
            selfsim1=sim[i,i]
            for (j in name)
              print i, j, (sim[i,j]+sim[j,i])/(selfsim1+sim[j,j])*10^6
          }
        }}
      ' - > "$scorefile" || ! file_ok "$scorefile"
    then
      echo "Running $BLASTALL or reading its output did not result." >&2
      rm -f "$blasttmpname".{n,p}{hr,in,sq}
      exit 1
    fi
    rm -f "$blasttmpname".{n,p}{hr,in,sq}
  else
    echo "Running $FORMATDB did not result." >&2
    exit 1
  fi
fi

########################################################################

if [ $woco -gt 0 ]; then # Compute similarity scores from sequence word counts
  awkscript='
    BEGIN {
      wordlength+=0
      if (wordlength<2) {
        print ARGV[0] ": Using default word length of 2." > "/dev/stderr"
        Errorcheck=1
        exit 1
      }
      OFS="\t"
    }
    # Compute similarities and get label
    /^>/ {
      master_woco(wordlength,seqcount,curseq,seqlab,protein)
      curseq=""
      thislab=substr(substr($0,2),1,31)
      sub(/[ \t].*$/,"",thislab)
      seqlab[++seqcount]=thislab
    }
    # Get sequence
    ! /^>/ { curseq=(curseq $0) }
    # Compute similarities of last sequence
    END {
      if (! Errorcheck)
        master_woco(wordlength,seqcount,curseq,seqlab,protein)
    }
    # Manages counting words and printing similarities
    function master_woco(wordlen,scount,curseq,slabel,isprot, \
      curlab,i) {
      curseq=toupper(curseq)
      if (isprot)
        gsub(/[^*ACDEFGHIKLMNPQRSTVWY]/,"",curseq)
      else
        gsub(/[^ACGT]/,"",curseq)
      if (curseq) {
        countwords(wordlen,curseq,scount,presence,wordcount)
        curlab=slabel[scount]
        for (i=1;i<scount;i++) {
          sim=1000*(1-worddist(i,scount,presence,wordcount))
          sub(/,/,".",sim)
          print curlab, slabel[i], sim
        }
      } else if (scount) {
        print "Invalid FASTA format in file " FILENAME ", line " FNR \
          > "/dev/stderr"
        Errorcheck=1
        exit 1
      }
    }
    # Counts words in a sequence of characters
    function countwords(wlen,thisseq,seqnum,parray,wcarray, \
      maxlen,i,thisword) {
      maxlen=length(thisseq)-wlen+1
      for (i=1;i<=maxlen;i++) {
        parray[thisword=substr(thisseq,i,wlen)]++
        wcarray[seqnum,thisword]++
      }
    }
    # Computes Bray-Curtis distance between word counts
    function worddist(i,j,parray,wcarray,	k,count1, \
      count2,sum,dist) {
      for (k in parray) {
        count1=wcarray[i,k]+0
        count2=wcarray[j,k]+0
        if (count1>count2)
          dist+=(count1-count2)
        else if (count2>count1)
          dist+=(count2-count1)
        sum+=(count1+count2)
      }
      return dist/sum
    }
  '
  [ "$convert" = "-tolower" ] && isprot=0 || isprot=1
  scorefile="$(trunk_name "$infile")_$(trunk_name "$0").sim"
  echo "Computing pair-wise word-count similarity scores..."
  echo
  if ! awk -v wordlength=$woco -v protein="$isprot" \
    "$awkscript" "$infile" > "$scorefile"; then
    echo "Unable to compute word-count similarity scores." >&2
    echo "Examine input file or your awk implementation." >&2
    exit 1  
  fi
fi

########################################################################

if [ -z "$matrix" ] || [ $printmat ]; then # Print BLOSUM matrix
  blosum='#  Blosum80
#  Matrix made by matblas from blosum80.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 80
#  Entropy =   0.9868, Expected =  -0.7442
GAP-TRUNCATION-LENGTH=	10
GAP-DECAY-LENGTH=	5
GAP-PENALTIES=	12	6	0
	A	R	N	D	C	Q	E	G	H	I	L	K	M	F	P	S	T	W	Y	V	B	Z	X	?	a	g	t	c	u	r	y	m	k	s	w	h	b	v	d	]	n
A	7	-3	-3	-3	-1	-2	-2	0	-3	-3	-3	-1	-2	-4	-1	2	0	-5	-4	-1	-3	-2	-1	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
R	-3	9	-1	-3	-6	1	-1	-4	0	-5	-4	3	-3	-5	-3	-2	-2	-5	-4	-4	-2	0	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
N	-3	-1	9	2	-5	0	-1	-1	1	-6	-6	0	-4	-6	-4	1	0	-7	-4	-5	5	-1	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
D	-3	-3	2	10	-7	-1	2	-3	-2	-7	-7	-2	-6	-6	-3	-1	-2	-8	-6	-6	6	1	-3	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
C	-1	-6	-5	-7	13	-5	-7	-6	-7	-2	-3	-6	-3	-4	-6	-2	-2	-5	-5	-2	-6	-7	-4	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
Q	-2	1	0	-1	-5	9	3	-4	1	-5	-4	2	-1	-5	-3	-1	-1	-4	-3	-4	-1	5	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
E	-2	-1	-1	2	-7	3	8	-4	0	-6	-6	1	-4	-6	-2	-1	-2	-6	-5	-4	1	6	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
G	0	-4	-1	-3	-6	-4	-4	9	-4	-7	-7	-3	-5	-6	-5	-1	-3	-6	-6	-6	-2	-4	-3	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
H	-3	0	1	-2	-7	1	0	-4	12	-6	-5	-1	-4	-2	-4	-2	-3	-4	3	-5	-1	0	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
I	-3	-5	-6	-7	-2	-5	-6	-7	-6	7	2	-5	2	-1	-5	-4	-2	-5	-3	4	-6	-6	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
L	-3	-4	-6	-7	-3	-4	-6	-7	-5	2	6	-4	3	0	-5	-4	-3	-4	-2	1	-7	-5	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
K	-1	3	0	-2	-6	2	1	-3	-1	-5	-4	8	-3	-5	-2	-1	-1	-6	-4	-4	-1	1	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
M	-2	-3	-4	-6	-3	-1	-4	-5	-4	2	3	-3	9	0	-4	-3	-1	-3	-3	1	-5	-3	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
F	-4	-5	-6	-6	-4	-5	-6	-6	-2	-1	0	-5	0	10	-6	-4	-4	0	4	-2	-6	-6	-3	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
P	-1	-3	-4	-3	-6	-3	-2	-5	-4	-5	-5	-2	-4	-6	12	-2	-3	-7	-6	-4	-4	-2	-3	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
S	2	-2	1	-1	-2	-1	-1	-1	-2	-4	-4	-1	-3	-4	-2	7	2	-6	-3	-3	0	-1	-1	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
T	0	-2	0	-2	-2	-1	-2	-3	-3	-2	-3	-1	-1	-4	-3	2	8	-5	-3	0	-1	-2	-1	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
W	-5	-5	-7	-8	-5	-4	-6	-6	-4	-5	-4	-6	-3	0	-7	-6	-5	16	3	-5	-8	-5	-5	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
Y	-4	-4	-4	-6	-5	-3	-5	-6	3	-3	-2	-4	-3	4	-6	-3	-3	3	11	-3	-5	-4	-3	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
V	-1	-4	-5	-6	-2	-4	-4	-6	-5	4	1	-4	1	-2	-4	-3	0	-5	-3	7	-6	-4	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
B	-3	-2	5	6	-6	-1	1	-2	-1	-6	-7	-1	-5	-6	-4	0	-1	-8	-5	-6	6	0	-3	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
Z	-2	0	-1	1	-7	5	6	-4	0	-6	-5	1	-3	-6	-2	-1	-2	-5	-4	-4	0	6	-1	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
X	-1	-2	-2	-3	-4	-2	-2	-3	-2	-2	-2	-2	-2	-3	-3	-1	-1	-5	-3	-2	-3	-1	-2	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
?	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
a	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	4	-2	-2	-2	-2	0	-2	0	-2	-2	0	0	0	0	0	-9	0
g	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	4	-2	-2	-2	0	-2	-2	0	0	-2	0	0	0	0	-9	0
t	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	-2	4	-2	4	-2	0	-2	0	-2	0	0	0	0	0	-9	0
c	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	-2	-2	4	-2	-2	0	0	-2	0	-2	0	0	0	0	-9	0
u	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	-2	4	-2	4	-2	0	-2	0	-2	0	0	0	0	0	-9	0
r	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	0	-2	-2	-2	2	4	1	1	1	1	0	0	0	0	-9	0
y	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	-2	0	0	0	4	2	1	1	1	1	0	0	0	0	-9	0
m	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	-2	-2	0	-2	1	1	2	-2	1	1	0	0	0	0	-9	0
k	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	0	0	-2	0	1	1	-2	2	1	1	0	0	0	0	-9	0
s	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-2	0	-2	0	-2	1	1	1	1	2	-2	0	0	0	0	-9	0
w	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	-2	0	-2	0	1	1	1	1	-2	2	0	0	0	0	-9	0
h	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	-9	0
b	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	-9	0
v	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	-9	0
d	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	-9	0
]	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9
n	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	-9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	-9	0
'
  if [ $printmat ]; then
    echo "$blosum"
    exit
  else
    echo "Blosum matrix not provided, creating temporary default matrix file..."
    echo
    tmpmat="$(trunk_name "$0")_blosum_$$.tmp"
    echo "$blosum" > "$tmpmat"
    matrix="$tmpmat"
  fi
fi

########################################################################

[ "$infile" ] && pogfile="$(trunk_name "$infile")" || pogfile="$(trunk_name "$filterfile")"
outfile="${pogfile}_poa.out"
fastafile="${pogfile}_poa.fas"
pogfile="${pogfile}_poa.pog"

fastatmpfile="$(trunk_name "$0")_fasta_$$.tmp"

[ "$scorefile" ] && readscores="-do_progressive -read_pairscores" || readscores=

[ "$msafile" ] && readmsa="-read_msa" || readmsa=

[ "$tmpfilter" ] && remove="-remove" || remove=

[ "$infile" ] && readfasta="-read_fasta" || readfasta=

echo "Starting POA:"
echo "$poa" $readfasta "$infile" -preserve_seqorder $convert \
        $readmsa "$msafile" $remove "$tmpfilter" $readscores "$scorefile" -v \
        -pir "$fastatmpfile" -po "$pogfile" "$matrix"
echo

if eval "$poa" $readfasta "$infile" -preserve_seqorder $convert \
        $readmsa "$msafile" $remove "$tmpfilter" $readscores "$scorefile" -v \
        -pir "$fastatmpfile" -po "$pogfile" "$matrix" &> "$outfile" && 
   ! grep -Fq 'Error generating pair scores' "$outfile"

then

  echo "POA run finished."
  echo
  
  # Create a nice FASTA output file  
  [ "$isgbk" -o "$clean" ] && replace="s/\(>\)\([^ ]\+ \)\(.\+\)$/\1\3/" ||
    replace="s/ untitled$//"
  if sed '
      v
      /^>/ '"$replace"'
      /^>/ ! y/./-/
    ' "$fastatmpfile" > "$fastafile" 2> /dev/null; then
    rm -f "$fastatmpfile"
  else
    echo "WARNING: Cleaning $fastafile did not work. Examine your sed implementation." >&2
    echo
    mv "$fastatmpfile" "$fastafile"
  fi
  
  # Print resulting file names  
  echo "Results are in:"
  echo "	$fastafile	Aligned sequences in FASTA format"
  if [ -z "$delete" ]; then
    echo "	$pogfile	Aligned sequences in partial order graph format"
    echo "	$outfile	Information about the POA run"
    [ "$taxonomy" -o $woco -gt 0 -o "$doblast" ] && 
      echo "	$scorefile	Pairwise similarity scores"
    if [ "$isgbk" ] || [ "$clean" ]; then
      [ -z "$keeptmp" ] &&  rm -f "$infile" ||
        echo "	$infile	Intermediarily generated FASTA file"
    fi
  else # Delete all files except aligned sequences in FASTA format
    rm -f "$pogfile" "$outfile"
    [ "$taxonomy" -o $woco -gt 0 -o "$doblast" ] && rm -f "$scorefile"
    [ "$isgbk" -o "$clean" ] && rm -f "$infile"
  fi

else

  echo "$poa did not result. Examine $outfile and input files." >&2
  rm -f "$fastatmpfile" 

fi

[ "$matrix" = "$tmpmat" ] && rm -f "$tmpmat" # Remove matrix file if created

[ "$tmpfilter" ] && rm -f "$tmpfilter"

echo
