Source code for propy.PseudoAAC

# propy3, formerly protpy, is a Python package to compute protein descriptors
# Copyright (C) 2012 Dongsheng Cao and Yizeng Liang, oriental-cds@163.com
# Copyright (C) 2020-2022 Martin Thoma

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; in version 2
# of the License.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA  02110-1301, USA.
"""
Instead of using the conventional 20-D amino acid composition to represent the
sample of a protein, Prof. Kuo-Chen Chou proposed the pseudo amino acid (PseAA)
composition in order for inluding the sequence-order information. Based on the
concept of Chou's pseudo amino acid composition, the server PseAA was designed
in a flexible way, allowing users to generate various kinds of pseudo amino
acid composition for a given protein sequence by selecting different parameters
and their combinations. This module aims at computing two types of PseAA
descriptors: Type I and Type II.

References
----------
.. [1] Kuo-Chen Chou. Prediction of Protein Cellular Attributes Using
       Pseudo-Amino Acid Composition. PROTEINS: Structure, Function, and
       Genetics, 2001, 43: 246-255.

.. [2] http://www.csbio.sjtu.edu.cn/bioinf/PseAAC/
.. [3] http://www.csbio.sjtu.edu.cn/bioinf/PseAAC/type2.htm
.. [4] Kuo-Chen Chou. Using amphiphilic pseudo amino acid composition to
       predict enzyme subfamily classes. Bioinformatics, 2005, 21, 10-19.


The hydrophobicity values are from JACS, 1962, 84: 4240-4246. (C. Tanford).

The hydrophilicity values are from PNAS, 1981, 78:3824-3828 (T.P.Hopp & K.R.Woods).

The side-chain mass for each of the 20 amino acids.

CRC Handbook of Chemistry and Physics, 66th ed., CRC Press, Boca Raton, Florida (1985).

R.M.C. Dawson, D.C. Elliott, W.H. Elliott, K.M. Jones,
Data for Biochemical Research 3rd ed.,

Clarendon Press Oxford (1986).
"""

# Core Library
import json
import math
from typing import Any, Dict

# Third party
from pkg_resources import resource_filename

# First party
from propy import AALetter

with open(resource_filename(__name__, "data/hydrophobicity.json"), "r") as f:
    _Hydrophobicity: Dict[str, float] = json.load(f)

with open(resource_filename(__name__, "data/hydrophilicity.json"), "r") as f:
    _hydrophilicity: Dict[str, float] = json.load(f)

with open(resource_filename(__name__, "data/residuemass.json"), "r") as f:
    _residuemass: Dict[str, float] = json.load(f)


with open(resource_filename(__name__, "data/pK1.json"), "r") as f:
    _pK1: Dict[str, float] = json.load(f)

with open(resource_filename(__name__, "data/pK2.json"), "r") as f:
    _pK2: Dict[str, float] = json.load(f)

with open(resource_filename(__name__, "data/pI.json"), "r") as f:
    _pI: Dict[str, float] = json.load(f)


def _mean(listvalue):
    """
    The mean value of the list data.

    Examples
    --------
    >>> _mean(listvalue=[1, 2, 3])
    2.0
    """
    return sum(listvalue) / len(listvalue)


def _std(listvalue, ddof=1):
    """
    The standard deviation of the list data.

    Examples
    --------
    >>> _std(listvalue=[1, 2, 3])
    1.0
    """
    mean = _mean(listvalue)
    temp = [math.pow(i - mean, 2) for i in listvalue]
    res = math.sqrt(sum(temp) / (len(listvalue) - ddof))
    return res


[docs]def NormalizeEachAAP(AAP): """ All of the amino acid indices are centralized and standardized before the calculation. Parameters ---------- AAP is a dict form containing the properties of 20 amino acids. Returns ------- result is the a dict form containing the normalized properties of 20 amino acids. Examples -------- >>> result = NormalizeEachAAP(AAP=_Hydrophobicity) """ if len(list(AAP.values())) != 20: print("You can not input the correct number of properities of Amino acids!") else: result = {} for i, j in list(AAP.items()): result[i] = (j - _mean(list(AAP.values()))) / _std( list(AAP.values()), ddof=0 ) return result
# Type I descriptors########################################################### # Pseudo-Amino Acid Composition descriptors#################################### def _GetCorrelationFunction( Ri="S", Rj="D", AAP=(_Hydrophobicity, _hydrophilicity, _residuemass) ): """ Computing the correlation between two given amino acids using the above three properties. Parameters ---------- Ri and Rj are the amino acids, respectively. Returns ------- result is the correlation value between two amino acids. Examples -------- >>> result = _GetCorrelationFunction(Ri="S", Rj="D") """ Hydrophobicity = NormalizeEachAAP(AAP[0]) hydrophilicity = NormalizeEachAAP(AAP[1]) residuemass = NormalizeEachAAP(AAP[2]) theta1 = math.pow(Hydrophobicity[Ri] - Hydrophobicity[Rj], 2) theta2 = math.pow(hydrophilicity[Ri] - hydrophilicity[Rj], 2) theta3 = math.pow(residuemass[Ri] - residuemass[Rj], 2) theta = round((theta1 + theta2 + theta3) / 3.0, 3) return theta def _GetSequenceOrderCorrelationFactor(ProteinSequence: str, k: int = 1) -> float: """ Computing the Sequence order correlation factor with gap equal to k based on [_Hydrophobicity, _hydrophilicity, _residuemass]. Parameters ---------- ProteinSequence : str a pure protein sequence. k : int is the gap. Returns ------- result : float the correlation factor value with the gap equal to k Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = _GetSequenceOrderCorrelationFactor(protein) """ LengthSequence = len(ProteinSequence) res = [] for i in range(LengthSequence - k): AA1 = ProteinSequence[i] AA2 = ProteinSequence[i + k] res.append(_GetCorrelationFunction(AA1, AA2)) result = round(sum(res) / (LengthSequence - k), 3) return result
[docs]def GetAAComposition(ProteinSequence: str) -> Dict[Any, Any]: """ Calculate the composition of Amino acids for a given protein sequence. Parameters ---------- ProteinSequence : str a pure protein sequence Returns ------- result is a dict form containing the composition of 20 amino acids. Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = GetAAComposition(protein) """ LengthSequence = len(ProteinSequence) Result = {} for i in AALetter: Result[i] = round(float(ProteinSequence.count(i)) / LengthSequence * 100, 3) return Result
def _GetPseudoAAC1(ProteinSequence, lamda=10, weight=0.05): """ Computing the first 20 of type I pseudo-amino acid compostion descriptors based on [_Hydrophobicity, _hydrophilicity, _residuemass]. """ rightpart = 0.0 for i in range(lamda): rightpart = rightpart + _GetSequenceOrderCorrelationFactor( ProteinSequence, k=i + 1 ) AAC = GetAAComposition(ProteinSequence) result = {} temp = 1 + weight * rightpart for index, char in enumerate(AALetter): result["PAAC" + str(index + 1)] = round(AAC[char] / temp, 3) return result def _GetPseudoAAC2(ProteinSequence, lamda=10, weight=0.05): """ Computing the last lamda of type I pseudo-amino acid compostion descriptors based on [_Hydrophobicity, _hydrophilicity, _residuemass]. """ rightpart = [] for i in range(lamda): rightpart.append(_GetSequenceOrderCorrelationFactor(ProteinSequence, k=i + 1)) result = {} temp = 1 + weight * sum(rightpart) for index in range(20, 20 + lamda): result["PAAC" + str(index + 1)] = round( weight * rightpart[index - 20] / temp * 100, 3 ) return result def _GetPseudoAAC( ProteinSequence: str, lamda: int = 10, weight: float = 0.05 ) -> Dict[Any, Any]: """ Computing all of type I pseudo-amino acid compostion descriptors based on three given properties. Note that the number of PAAC strongly depends on the lamda value. if lamda = 20, we can obtain 20+20=40 PAAC descriptors. The size of these values depends on the choice of lamda and weight simultaneously. AAP=[_Hydrophobicity, _hydrophilicity, _residuemass] Parameters ---------- ProteinSequence : str a pure protein sequence lamda : int reflects the rank of correlation and is a non-Negative integer, such as 15. Note that (1) lamda should NOT be larger than the length of input protein sequence; (2) lamda must be non-Negative integer, such as 0, 1, 2, ...; (3) when lamda =0, the output of PseAA server is the 20-D amino acid composition. weight factor : float is designed for the users to put weight on the additional PseAA components with respect to the conventional AA components. The user can select any value within the region from 0.05 to 0.7 for the weight factor. Returns ------- result : Dict[Any, Any] contains calculated 20+lamda PAAC descriptors Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = _GetPseudoAAC(protein) """ res: Dict[Any, Any] = {} res.update(_GetPseudoAAC1(ProteinSequence, lamda=lamda, weight=weight)) res.update(_GetPseudoAAC2(ProteinSequence, lamda=lamda, weight=weight)) return res # Type II descriptors########################################################## # Amphiphilic Pseudo-Amino Acid Composition descriptors######################## def _GetCorrelationFunctionForAPAAC( Ri="S", Rj="D", AAP=(_Hydrophobicity, _hydrophilicity) ): """ Computing the correlation between two given amino acids using the above two properties for APAAC (type II PseAAC). Parameters ---------- Ri and Rj are the amino acids, respectively. Returns ------- result : the correlation value between two amino acids Examples -------- >>> result = _GetCorrelationFunctionForAPAAC(Ri="S", Rj="D") """ Hydrophobicity = NormalizeEachAAP(AAP[0]) hydrophilicity = NormalizeEachAAP(AAP[1]) theta1 = round(Hydrophobicity[Ri] * Hydrophobicity[Rj], 3) theta2 = round(hydrophilicity[Ri] * hydrophilicity[Rj], 3) return theta1, theta2
[docs]def GetSequenceOrderCorrelationFactorForAPAAC(ProteinSequence, k=1): """ Computing the Sequence order correlation factor with gap equal to k based on [_Hydrophobicity, _hydrophilicity] for APAAC (type II PseAAC) . Parameters ---------- ProteinSequence : str a pure protein sequence k is the gap. Returns ------- result is the correlation factor value with the gap equal to k. Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = GetSequenceOrderCorrelationFactorForAPAAC(protein) """ LengthSequence = len(ProteinSequence) resHydrophobicity = [] reshydrophilicity = [] for i in range(LengthSequence - k): AA1 = ProteinSequence[i] AA2 = ProteinSequence[i + k] temp = _GetCorrelationFunctionForAPAAC(AA1, AA2) resHydrophobicity.append(temp[0]) reshydrophilicity.append(temp[1]) result = [] result.append(round(sum(resHydrophobicity) / (LengthSequence - k), 3)) result.append(round(sum(reshydrophilicity) / (LengthSequence - k), 3)) return result
[docs]def GetAPseudoAAC1(ProteinSequence, lamda=30, weight=0.5): """ Computing the first 20 of type II pseudo-amino acid compostion descriptors based on [_Hydrophobicity, _hydrophilicity]. """ rightpart = 0.0 for i in range(lamda): rightpart = rightpart + sum( GetSequenceOrderCorrelationFactorForAPAAC(ProteinSequence, k=i + 1) ) AAC = GetAAComposition(ProteinSequence) result = {} temp = 1 + weight * rightpart for index, char in enumerate(AALetter): result["APAAC" + str(index + 1)] = round(AAC[char] / temp, 3) return result
[docs]def GetAPseudoAAC2(ProteinSequence, lamda=30, weight=0.5): """ Computing the last lamda of type II pseudo-amino acid compostion descriptors based on (_Hydrophobicity, _hydrophilicity). """ rightpart = [] for i in range(lamda): temp = GetSequenceOrderCorrelationFactorForAPAAC(ProteinSequence, k=i + 1) rightpart.append(temp[0]) rightpart.append(temp[1]) result = {} temp = 1 + weight * sum(rightpart) for index in range(20, 20 + 2 * lamda): result["APAAC" + str(index + 1)] = round( weight * rightpart[index - 20] / temp * 100, 3 ) return result
[docs]def GetAPseudoAAC(ProteinSequence, lamda: int = 30, weight: float = 0.5): """ Computing all of type II pseudo-amino acid compostion descriptors based on the given properties. Note that the number of PAAC strongly depends on the lamda value. if lamda = 20, we can obtain 20+20=40 PAAC descriptors. The size of these values depends on the choice of lamda and weight simultaneously. Parameters ---------- ProteinSequence : str a pure protein sequence lamda : int reflects the rank of correlation and is a non-Negative integer, such as 15. Note that (1)lamda should NOT be larger than the length of input protein sequence; (2) lamda must be non-Negative integer, such as 0, 1, 2, ...; (3) when lamda =0, the output of PseAA server is the 20-D amino acid composition. weight : float is designed for the users to put weight on the additional PseAA components with respect to the conventional AA components. The user can select any value within the region from 0.05 to 0.7 for the weight factor. Returns ------- result : Dict[Any, Any] contains calculated 20+lamda PAAC descriptors Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = GetAPseudoAAC(protein) """ res: Dict[Any, Any] = {} res.update(GetAPseudoAAC1(ProteinSequence, lamda=lamda, weight=weight)) res.update(GetAPseudoAAC2(ProteinSequence, lamda=lamda, weight=weight)) return res
# Type I descriptors########################################################### # Pseudo-Amino Acid Composition descriptors#################################### # based on different properties################################################
[docs]def GetCorrelationFunction(Ri="S", Rj="D", AAP=None): """ Computing the correlation between two given amino acids using the given properties. Parameters ---------- Ri : str amino acids Rj : str amino acids AAP : List[Any] contains the properties, each of which is a dict form. Returns ------- result is the correlation value between two amino acids. Examples -------- >>> GetCorrelationFunction(Ri="S", Rj="D", AAP=_Hydrophobicity) """ if AAP is None: AAP = [] NumAAP = len(AAP) theta = 0.0 for i in range(NumAAP): temp = NormalizeEachAAP(AAP[i]) theta = theta + math.pow(temp[Ri] - temp[Rj], 2) result = round(theta / NumAAP, 3) return result
[docs]def GetSequenceOrderCorrelationFactor(ProteinSequence, k: int = 1, AAP=None): """ Computing the Sequence order correlation factor with gap equal to k based on the given properities. Parameters ---------- ProteinSequence : str a pure protein sequence k : int the gap. AAP : List[Any] contains the properties, each of which is a dict form. Returns ------- result is the correlation factor value with the gap equal to k. Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = GetSequenceOrderCorrelationFactor(protein) """ if AAP is None: AAP = [] LengthSequence = len(ProteinSequence) res = [] for i in range(LengthSequence - k): AA1 = ProteinSequence[i] AA2 = ProteinSequence[i + k] res.append(GetCorrelationFunction(AA1, AA2, AAP)) result = round(sum(res) / (LengthSequence - k), 3) return result
[docs]def GetPseudoAAC1(ProteinSequence, lamda=30, weight=0.05, AAP=None): """ Computing the first 20 of type I pseudo-amino acid compostion descriptors based on the given properties. """ if AAP is None: AAP = [] rightpart = 0.0 for i in range(lamda): rightpart = rightpart + GetSequenceOrderCorrelationFactor( ProteinSequence, i + 1, AAP ) AAC = GetAAComposition(ProteinSequence) result = {} temp = 1 + weight * rightpart for index, char in enumerate(AALetter): result["PAAC" + str(index + 1)] = round(AAC[char] / temp, 3) return result
[docs]def GetPseudoAAC2(ProteinSequence, lamda: int = 30, weight: float = 0.05, AAP=None): """ Compute the last lamda of type I pseudo-amino acid compostion descriptors based on the given properties. """ if AAP is None: AAP = [] rightpart = [] for i in range(lamda): rightpart.append(GetSequenceOrderCorrelationFactor(ProteinSequence, i + 1, AAP)) result = {} temp = 1 + weight * sum(rightpart) for index in range(20, 20 + lamda): result["PAAC" + str(index + 1)] = round( weight * rightpart[index - 20] / temp * 100, 3 ) return result
[docs]def GetPseudoAAC(ProteinSequence: str, lamda: int = 30, weight: float = 0.05, AAP=None): """ Computing all of type I pseudo-amino acid compostion descriptors based on the given properties. Note that the number of PAAC strongly depends on the lamda value. if lamda = 20, we can obtain 20+20=40 PAAC descriptors. The size of these values depends on the choice of lamda and weight simultaneously. You must specify some properties into AAP. Parameters ---------- ProteinSequence : str a pure protein sequence lamda : int reflects the rank of correlation and is a non-Negative integer, such as 15. Note that (1)lamda should NOT be larger than the length of input protein sequence; (2) lamda must be non-Negative integer, such as 0, 1, 2, ...; (3) when lamda =0, the output of PseAA server is the 20-D amino acid composition. weight : float is designed for the users to put weight on the additional PseAA components with respect to the conventional AA components. The user can select any value within the region from 0.05 to 0.7 for the weight factor. AAP : List[Any] contains the properties, each of which is a dict form. Returns ------- result is a dict form containing calculated 20+lamda PAAC descriptors. Examples -------- >>> from propy.GetProteinFromUniprot import GetProteinSequence >>> protein = GetProteinSequence(ProteinID="Q9NQ39") >>> result = GetPseudoAAC(protein) """ if AAP is None: AAP = [] res: Dict[Any, Any] = {} res.update(GetPseudoAAC1(ProteinSequence, lamda, weight, AAP)) res.update(GetPseudoAAC2(ProteinSequence, lamda, weight, AAP)) return res