"""Collection of miscelanous tool

   NAME   : misc.py
   TYPE   : module
   PYTHON : 1.5.2b2
   VERSION: 1.0.1
   AUTHOR : Arne Mueller
   DATE   : 09.07.99

   CHANGES: Wed Feb  9 14:44:29 GMT 2000
   o fixed AlignmentContainer.alphabet

   CHANGES VERSION 1.0.1
   DATE: Tue Apr  4 18:29:05 BST 2000
   o fixed another bug in AlignmentContainer.alphabet
   o KeyError's are caught when one of the sequences contains
     an invalid residue letter, it's just ignored for an identity
     test.
"""

import math
import sys

def hssp(L, n=0):
    """Calculates the hssp threshold for a given length

       args   : L, the alignment length; values above 417 are treated
                as 417
                n, a constant added to the curve value (default 0)
       returns: n + 510 * L ** (-0.32 * (1 + exp(-L / 1000)))
       raises : OverflowError if the exponential/power overflows; a
                note is written to stderr and the original exception
                is re-raised unchanged. math.pow itself raises
                ValueError for L <= 0.
    """
    ### Described in: Rost, Prot. Eng. Vol 2, No 2, pp85-94, 1999
    if L > 417:
        L = 417.0
    try:
        return n + 510.0 * math.pow(L, -0.32 * (1.0 + math.exp(-L/1000.0)))
    except OverflowError:
        sys.stderr.write('OverflowError in method hssp: @ L = %d\n'%L)
        ### bare raise keeps the original message and traceback
        raise

def nInt(x):
    """args: float x, returns the nearest integer of x

       Halves are rounded away from zero (2.5 -> 3, -2.5 -> -3); zero
       is passed to int() unchanged.
    """
    if x > 0.0:
        return int(x + 0.5)
    if x < 0.0:
        return int(x - 0.5)
    return int(x)

def residueShare(a_start, a_end, b_start, b_end):
    """Counts the residues shared by the ranges of sequence a and b

       parameters: a_start, a_end, b_start, b_end (inclusive positions)
       returns   : number of positions covered by both ranges, 0 if
                   neither range starts inside the other
    """
    ### b starts inside a: shared part runs from b_start to the nearer end
    if a_start <= b_start <= a_end:
        return min(a_end, b_end) - b_start + 1
    ### a starts inside b: shared part runs from a_start to the nearer end
    if b_start <= a_start <= b_end:
        return min(a_end, b_end) - a_start + 1
    return 0

def overlap(a_start, a_end, b_start, b_end):
    """Calculates how much of sequence a is shared with sequence b

       parameters: a_start, a_end, b_start, b_end
       returns   : shared residues divided by the length of a, i.e. a
                   fraction (callers multiply by 100 to get %), or 0
                   when nothing is shared
    """
    shared = residueShare(a_start, a_end, b_start, b_end)
    if shared <= 0:
        return 0
    return shared / float(a_end - a_start + 1)

class AlignmentContainer:
    """Holds a list of alignment hits and clusters redundant ones.

       The hit objects are not defined here; the methods read the
       attributes pid, evalue, length, q_start, q_end and sequence
       (plus db and name for error output) from them.
    """

    def __init__(self, sequences=None):
        """args: sequences, optional list of hits. The list is stored
           as passed in (not copied); a new empty list is created when
           it is omitted so instances never share a default list.
        """
        ### maps residue letters (either case) to their upper case form;
        ### 'X', 'x' and '-' map to None
        alphabet = {'X': None, 'x': None, '-': None}
        for letter in 'CHIMSVAGLPTFRYWDNEQKBZ':
            alphabet[letter] = letter
            alphabet[letter.lower()] = letter
        self.alphabet = alphabet
        if sequences is None:
            sequences = []
        self.sequences = sequences
        ### requirements for redundant sequences
        self.min_id = 80       # minimum %id of
        self.max_offset = 10   # maximum allowed N or C terminal offset
        self.min_overlap = 90  # minimum % overlap
        self.check = 50        # residue position to start 'giving up' check

    def best(self, a, b):
        """Comparison function ordering hits from best to worst.

           returns -1 if a ranks before b, 1 if b ranks before a and 0
           if they tie. Higher pid wins, then lower evalue, then
           greater length.
        """
        ### by pid
        if a.pid > b.pid: return -1
        if a.pid < b.pid: return  1
        ### by evalue
        if a.evalue < b.evalue: return -1
        if a.evalue > b.evalue: return  1
        ### by length
        if a.length > b.length: return -1
        if a.length < b.length: return  1
        ### equal
        return 0

    def SimilarId(self, a, b):
        """returns 1 if the %identity of a and b over their common
           region is at least self.min_id, otherwise 0.

           Columns with a letter missing from self.alphabet, and
           columns where both residues map to None, are ignored. If
           the two trimmed sequences differ in length, diagnostics are
           written to stderr and the process exits via sys.exit(-1).
        """
        ### get overlapping region of both hits: the hit that extends
        ### further at either end is trimmed by the difference
        ### NOTE(review): assumes sequence indices advance 1:1 with
        ### query positions -- confirm against the hit parser
        d_start = abs(a.q_start - b.q_start)
        d_end   = abs(a.q_end - b.q_end)
        if a.q_start <= b.q_start:
            a_start, b_start = d_start, 0
        else:
            a_start, b_start = 0, d_start
        if a.q_end >= b.q_end:
            a_end, b_end = a.length - d_end, b.length
        else:
            a_end, b_end = a.length, b.length - d_end
        a_aln = a.sequence[a_start:a_end]
        b_aln = b.sequence[b_start:b_end]
        length = len(a_aln)
        if length != len(b_aln):
            write = sys.stderr.write
            write('ERROR --- \n')
            for hit, start, end, aln in ((a, a_start, a_end, a_aln),
                                         (b, b_start, b_end, b_aln)):
                write('%s %s %s %s %s %s\n'
                      % (hit.db, hit.name, len(aln), start, end, aln))
                write('%s %s %s\n' % (hit.q_start, hit.q_end, hit.sequence))
            sys.exit(-1)
        ### speed up things using direct references!
        min_id = self.min_id
        alphabet = self.alphabet
        check = self.check
        ids = 0
        aln_length = 0
        for i, (letter_a, letter_b) in enumerate(zip(a_aln, b_aln)):
            if letter_a not in alphabet or letter_b not in alphabet:
                continue
            residue_a = alphabet[letter_a]
            residue_b = alphabet[letter_b]
            if residue_a is None and residue_b is None:
                continue
            if i > check:
                ### give up as soon as min_id is out of reach even if
                ### every remaining column were an identity; this is
                ### an upper bound of the final pid, so the answer is
                ### the same as for a full scan
                rest = length - i
                if nInt((ids + rest)*1.0/(aln_length + rest)*100.0) < min_id:
                    return 0
            aln_length = aln_length + 1
            if residue_a == residue_b:
                ids = ids + 1
        if ids:
            pid = nInt(ids*1.0/aln_length*100.0)
            if pid >= min_id:
                return 1
        return 0

    def Isredundant(self, b, a):
        """returns 1 if hits a and b are redundant, otherwise 0.

           Redundant means: the longer hit overlaps the other by at
           least self.min_overlap %, both offset tests below pass and
           SimilarId(a, b) is true. Note the argument order (b, a).
        """
        ### overlap is measured relative to the longer hit
        if a.length >= b.length:
            shared = overlap(a.q_start, a.q_end, b.q_start, b.q_end)
        else:
            shared = overlap(b.q_start, b.q_end, a.q_start, a.q_end)
        if nInt(shared*100.0) < self.min_overlap:
            return 0
        ### NOTE(review): the +1 sits inside abs(), so these tests are
        ### not symmetric in a and b -- confirm this is intended
        if abs(a.q_start-b.q_start+1) >= self.max_offset:
            return 0
        if abs(a.q_end-b.q_end+1) >= self.max_offset:
            return 0
        if self.SimilarId(a, b):
            return 1
        return 0

    def _fitsCluster(self, cluster, hit):
        """returns 1 if hit is redundant with every member of cluster"""
        for member in cluster:
            if not self.Isredundant(member, hit):
                return 0
        return 1

    def clusterHits(self):
        """Groups self.sequences into clusters of redundant hits.

           A hit joins the most recently created cluster in which it
           is redundant with every member, otherwise it starts a new
           cluster. Returns a list of clusters in order of creation,
           each one sorted from best to worst hit (see best).
        """
        ### list.sort() takes no comparison function in Python 3
        from functools import cmp_to_key
        clusters = []
        for hit in self.sequences:
            ### newest clusters are tried first
            for cluster in reversed(clusters):
                if self._fitsCluster(cluster, hit):
                    cluster.append(hit)
                    break
            else:
                clusters.append([hit])
        ranking = cmp_to_key(self.best)
        for cluster in clusters:
            cluster.sort(key=ranking)
        return clusters

    def __add__(self, other):
        """returns a new container holding the hits of self followed
           by those of other; thresholds are the defaults.
        """
        return AlignmentContainer(self.sequences + other.sequences)

