Source code for propy.QuasiSequenceOrder

# -*- coding: utf-8 -*-
"""
Compute the quasi-sequence-order descriptors based on the given protein
sequence. We can obtain two types of descriptors: sequence-order-coupling
numbers and quasi-sequence-order descriptors. Two distance matrices between the
20 amino acids are employed.

References
----------
.. [1] Kuo-Chen Chou. Prediction of Protein Subcellular Locations by Incorporating
       Quasi-Sequence-Order Effect. Biochemical and Biophysical Research
       Communications 2000, 278, 477-483.

.. [2] Kuo-Chen Chou and Yu-Dong Cai. Prediction of Protein Subcellular Locations by
       GO-FunD-PseAA predictor, Biochemical and Biophysical Research
       Communications, 2004, 320, 1236-1239.

.. [3] Gisbert Schneider and Paul Wrede. The Rational Design of Amino Acid
       Sequences by Artificial Neural Networks and Simulated Molecular
       Evolution: De Novo Design of an Idealized Leader Cleavage Site. Biophys
       Journal, 1994, 66, 335-344.

Authors: Dongsheng Cao and Yizeng Liang.
Date: 2012.09.03
Email: oriental-cds@163.com
"""

# Core Library
import json
import math
from typing import Any, Dict

# Third party
from pkg_resources import resource_filename

# First party
from propy import AALetter

# Schneider-Wrede physicochemical distance matrix (the one used by Chou et al.),
# loaded from the JSON data file that ships inside this package.
filepath = resource_filename(
    __name__, "data/schneider-wrede-physicochemical-distance-matrix.json"
)
# JSON text is UTF-8 by specification; naming the encoding keeps the load
# independent of the platform's locale default.
with open(filepath, "r", encoding="utf-8") as f:
    _Distance1: Dict[str, float] = json.load(f)

# Grantham chemical distance matrix (Grantham et al.), loaded the same way.
filepath = resource_filename(__name__, "data/grantham-chemical-distance-matrix.json")
with open(filepath, "r", encoding="utf-8") as f:
    _Distance2: Dict[str, float] = json.load(f)


def GetSequenceOrderCouplingNumber(
    ProteinSequence: str, d: int = 1, distancematrix: Dict[str, float] = _Distance1
):
    """
    Compute the dth-rank sequence order coupling number for a protein.

    Every pair of residues located ``d`` positions apart is looked up in
    ``distancematrix`` (the key is the two residue letters concatenated), the
    distance is squared, and the squares are summed.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    d : int
        the gap between two amino acids.
    distancematrix : Dict[str, float]
        distances keyed by the concatenation of two residue letters

    Returns
    -------
    tau : float
        the summed squared distances, rounded to three decimal places

    Example
    -------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumber(protein)
    """
    coupling = 0.0
    # Only positions that still have a partner d residues further on are
    # visited; when d is at least the sequence length the loop does not run.
    for start in range(len(ProteinSequence) - d):
        pair_key = ProteinSequence[start] + ProteinSequence[start + d]
        coupling += math.pow(distancematrix[pair_key], 2)
    return round(coupling, 3)
def GetSequenceOrderCouplingNumberp(
    ProteinSequence: str, maxlag: int = 30, distancematrix: Dict[Any, Any] = None
):
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag using a
    user-defined distance matrix.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
        (this function does not check it).
    distancematrix : Dict[Any, Any]
        contains 400 distance values. When omitted an empty mapping is
        substituted, so any pair lookup performed by
        :py:func:`GetSequenceOrderCouplingNumber` raises ``KeyError``; callers
        are expected to pass their own matrix.

    Returns
    -------
    Tau : Dict[str, float]
        coupling numbers under the keys ``"tau1"`` ... ``"tau<maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberp(protein)
    """
    matrix = {} if distancematrix is None else distancematrix
    return {
        "tau" + str(lag): GetSequenceOrderCouplingNumber(ProteinSequence, lag, matrix)
        for lag in range(1, maxlag + 1)
    }
def GetSequenceOrderCouplingNumberSW(
    ProteinSequence: str, maxlag: int = 30, distancematrix=_Distance1
):
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag based on
    the Schneider-Wrede physicochemical distance matrix.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
        (this function does not check it).
    distancematrix : Dict[Any, Any]
        the distance matrix to use; defaults to the Schneider-Wrede
        physicochemical distance matrix

    Returns
    -------
    Tau : Dict[Any, Any]
        coupling numbers under the keys ``"tausw1"`` ... ``"tausw<maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberSW(protein)
    """
    return {
        "tausw" + str(lag): GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )
        for lag in range(1, maxlag + 1)
    }
def GetSequenceOrderCouplingNumberGrant(
    ProteinSequence: str, maxlag: int = 30, distancematrix=_Distance2
):
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag based on
    the Grantham chemical distance matrix.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
        (this function does not check it).
    distancematrix : Dict[Any, Any]
        the distance matrix to use; defaults to the Grantham chemical
        distance matrix

    Returns
    -------
    Tau : Dict[Any, Any]
        coupling numbers under the keys ``"taugrant1"`` ...
        ``"taugrant<maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberGrant(protein)
    """
    return {
        "taugrant" + str(lag): GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )
        for lag in range(1, maxlag + 1)
    }
def GetSequenceOrderCouplingNumberTotal(
    ProteinSequence: str, maxlag: int = 30
) -> Dict[Any, Any]:
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag with both
    built-in distance matrices.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag

    Returns
    -------
    result : Dict
        the entries of :py:func:`GetSequenceOrderCouplingNumberSW` followed
        by those of :py:func:`GetSequenceOrderCouplingNumberGrant`

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberTotal(protein)
    """
    schneider_wrede = GetSequenceOrderCouplingNumberSW(ProteinSequence, maxlag=maxlag)
    grantham = GetSequenceOrderCouplingNumberGrant(ProteinSequence, maxlag=maxlag)
    combined: Dict[Any, Any] = {**schneider_wrede, **grantham}
    return combined
def GetAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the composition of amino acids for a given protein sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        for every letter in ``AALetter``, the fraction of the sequence made up
        of that letter, rounded to three decimal places. An empty sequence
        yields ``0.0`` for every letter.

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetAAComposition(protein)
    """
    LengthSequence = len(ProteinSequence)
    if LengthSequence == 0:
        # Nothing to count: report zero for every residue instead of
        # dividing by zero.
        return {letter: 0.0 for letter in AALetter}
    return {
        letter: round(ProteinSequence.count(letter) / LengthSequence, 3)
        for letter in AALetter
    }
def GetQuasiSequenceOrder1(
    ProteinSequence: str, maxlag: int = 30, weight: float = 0.1, distancematrix=None
):
    """
    Compute the first 20 quasi-sequence-order descriptors for a given protein
    sequence.

    Each descriptor is the amino acid composition of one letter in
    ``AALetter`` divided by ``1 + weight * (sum of the coupling numbers for
    lags 1..maxlag)``, rounded to six decimal places.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
    weight : float, optional (default: 0.1)
        the weight factor applied to the summed coupling numbers
    distancematrix : dict, optional
        user-defined distance matrix; an empty mapping is used when omitted

    Returns
    -------
    result : Dict[str, float]
        descriptors under the keys ``"QSO1"``, ``"QSO2"``, ... in the order
        of ``AALetter``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder1(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    matrix = {} if distancematrix is None else distancematrix

    coupling_total = 0.0
    for lag in range(1, maxlag + 1):
        coupling_total += GetSequenceOrderCouplingNumber(ProteinSequence, lag, matrix)

    composition = GetAAComposition(ProteinSequence)
    denominator = 1 + weight * coupling_total
    descriptors: Dict[str, float] = {
        "QSO" + str(rank): round(composition[letter] / denominator, 6)
        for rank, letter in enumerate(AALetter, start=1)
    }
    return descriptors
def GetQuasiSequenceOrder2(
    ProteinSequence: str, maxlag=30, weight=0.1, distancematrix=None
):
    """
    Compute the last maxlag quasi-sequence-order descriptors for a given
    protein sequence.

    The descriptor for lag ``k`` is ``weight * tau_k`` divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``,
    rounded to six decimal places.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
    weight : float, optional (default: 0.1)
        the weight factor applied to the coupling numbers
    distancematrix : dict, optional
        user-defined distance matrix; an empty mapping is used when omitted

    Returns
    -------
    result : dict
        descriptors under the keys ``"QSO21"`` ... ``"QSO<20 + maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder2(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    matrix = {} if distancematrix is None else distancematrix
    couplings = [
        GetSequenceOrderCouplingNumber(ProteinSequence, lag, matrix)
        for lag in range(1, maxlag + 1)
    ]
    denominator = 1 + weight * sum(couplings)
    # Numbering continues after the 20 composition-based descriptors.
    return {
        "QSO" + str(number): round(weight * tau / denominator, 6)
        for number, tau in enumerate(couplings, start=21)
    }
def GetQuasiSequenceOrder1SW(
    ProteinSequence: str, maxlag=30, weight=0.1, distancematrix=_Distance1
):
    """
    Compute the first 20 quasi-sequence-order descriptors for a given protein
    sequence, by default with the Schneider-Wrede distance matrix.

    Each descriptor is the amino acid composition of one letter in
    ``AALetter`` divided by ``1 + weight * (sum of the coupling numbers for
    lags 1..maxlag)``, rounded to six decimal places.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
    weight : float, optional (default: 0.1)
        the weight factor applied to the summed coupling numbers
    distancematrix : dict, optional
        the distance matrix to use

    Returns
    -------
    result : dict
        descriptors under the keys ``"QSOSW1"``, ``"QSOSW2"``, ... in the
        order of ``AALetter``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder1SW(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    coupling_total = 0.0
    for lag in range(1, maxlag + 1):
        coupling_total += GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )

    composition = GetAAComposition(ProteinSequence)
    denominator = 1 + weight * coupling_total
    return {
        "QSOSW" + str(rank): round(composition[letter] / denominator, 6)
        for rank, letter in enumerate(AALetter, start=1)
    }
def GetQuasiSequenceOrder2SW(
    ProteinSequence: str, maxlag=30, weight=0.1, distancematrix=_Distance1
):
    """
    Compute the last maxlag quasi-sequence-order descriptors for a given
    protein sequence, by default with the Schneider-Wrede distance matrix.

    The descriptor for lag ``k`` is ``weight * tau_k`` divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``,
    rounded to six decimal places.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
    weight : float, optional (default: 0.1)
        the weight factor applied to the coupling numbers
    distancematrix : dict, optional
        the distance matrix to use

    Returns
    -------
    result : dict
        descriptors under the keys ``"QSOSW21"`` ... ``"QSOSW<20 + maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder2SW(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    couplings = [
        GetSequenceOrderCouplingNumber(ProteinSequence, lag, distancematrix)
        for lag in range(1, maxlag + 1)
    ]
    denominator = 1 + weight * sum(couplings)
    # Numbering continues after the 20 composition-based descriptors.
    return {
        "QSOSW" + str(number): round(weight * tau / denominator, 6)
        for number, tau in enumerate(couplings, start=21)
    }
def GetQuasiSequenceOrder1Grant(
    ProteinSequence: str,
    maxlag: int = 30,
    weight: float = 0.1,
    distancematrix=_Distance2,
):
    """
    Compute the first 20 quasi-sequence-order descriptors for a given protein
    sequence, by default with the Grantham distance matrix.

    Each descriptor is the amino acid composition of one letter in
    ``AALetter`` divided by ``1 + weight * (sum of the coupling numbers for
    lags 1..maxlag)``, rounded to six decimal places.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
    weight : float, optional (default: 0.1)
        the weight factor applied to the summed coupling numbers
    distancematrix : dict, optional
        the distance matrix to use

    Returns
    -------
    result : dict
        descriptors under the keys ``"QSOgrant1"``, ``"QSOgrant2"``, ... in
        the order of ``AALetter``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder1Grant(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    coupling_total = 0.0
    for lag in range(1, maxlag + 1):
        coupling_total += GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )

    composition = GetAAComposition(ProteinSequence)
    denominator = 1 + weight * coupling_total
    return {
        "QSOgrant" + str(rank): round(composition[letter] / denominator, 6)
        for rank, letter in enumerate(AALetter, start=1)
    }
def GetQuasiSequenceOrder2Grant(
    ProteinSequence: str,
    maxlag: int = 30,
    weight: float = 0.1,
    distancematrix=_Distance2,
):
    """
    Compute the last maxlag quasi-sequence-order descriptors for a given
    protein sequence, by default with the Grantham distance matrix.

    The descriptor for lag ``k`` is ``weight * tau_k`` divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``,
    rounded to six decimal places.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the protein is expected to be longer than maxlag
    weight : float, optional (default: 0.1)
        the weight factor applied to the coupling numbers
    distancematrix : dict, optional
        the distance matrix to use

    Returns
    -------
    result : dict
        descriptors under the keys ``"QSOgrant21"`` ...
        ``"QSOgrant<20 + maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder2Grant(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    couplings = [
        GetSequenceOrderCouplingNumber(ProteinSequence, lag, distancematrix)
        for lag in range(1, maxlag + 1)
    ]
    denominator = 1 + weight * sum(couplings)
    # Numbering continues after the 20 composition-based descriptors.
    return {
        "QSOgrant" + str(number): round(weight * tau / denominator, 6)
        for number, tau in enumerate(couplings, start=21)
    }
def GetQuasiSequenceOrder(
    ProteinSequence: str, maxlag: int = 30, weight: float = 0.1
) -> Dict[Any, Any]:
    """
    Compute quasi-sequence-order descriptors for a given protein.

    The Schneider-Wrede and the Grantham variants of both descriptor groups
    are computed and merged into a single dictionary.

    See [1]_ for details.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag and the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        a weight factor. Please see reference 1 for its choice.

    Returns
    -------
    result : Dict
        contains all quasi-sequence-order descriptors

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder(protein)
    """
    # Each descriptor function is paired with the matrix it is evaluated on;
    # the order fixes the order of the keys in the merged result.
    stages = (
        (GetQuasiSequenceOrder1SW, _Distance1),
        (GetQuasiSequenceOrder2SW, _Distance1),
        (GetQuasiSequenceOrder1Grant, _Distance2),
        (GetQuasiSequenceOrder2Grant, _Distance2),
    )
    merged: Dict[Any, Any] = {}
    for compute, matrix in stages:
        merged.update(compute(ProteinSequence, maxlag, weight, matrix))
    return merged
def GetQuasiSequenceOrderp(
    ProteinSequence: str,
    maxlag: int = 30,
    weight: float = 0.1,
    distancematrix: Dict[Any, Any] = None,
) -> Dict[Any, Any]:
    """
    Compute quasi-sequence-order descriptors for a given protein using a
    user-defined distance matrix.

    See [1]_ for details.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag and the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        a weight factor. Please see reference 1 for its choice.
    distancematrix : Dict[Any, Any]
        contains 400 distance values; an empty mapping is used when omitted

    Returns
    -------
    result : Dict[Any, Any]
        the entries of :py:func:`GetQuasiSequenceOrder1` followed by those of
        :py:func:`GetQuasiSequenceOrder2`

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrderp(protein)
    """
    matrix = {} if distancematrix is None else distancematrix
    composition_part = GetQuasiSequenceOrder1(ProteinSequence, maxlag, weight, matrix)
    coupling_part = GetQuasiSequenceOrder2(ProteinSequence, maxlag, weight, matrix)
    merged: Dict[Any, Any] = {**composition_part, **coupling_part}
    return merged