# propy3, formerly protpy, is a Python package to compute protein descriptors
# Copyright (C) 2012 Dongsheng Cao and Yizeng Liang, oriental-cds@163.com
# Copyright (C) 2020-2022 Martin Thoma
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; in version 2
# of the License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
"""
Compute the quasi-sequence-order descriptors based on the given protein
sequence. We can obtain two types of descriptors: sequence-order-coupling
numbers and quasi-sequence-order descriptors. Two distance matrices between
the 20 amino acids are employed.
References
----------
.. [1] Kuo-Chen Chou. Prediction of Protein Subcellular Locations by
   Incorporating Quasi-Sequence-Order Effect. Biochemical and Biophysical
   Research Communications, 2000, 278, 477-483.
.. [2] Kuo-Chen Chou and Yu-Dong Cai. Prediction of Protein Subcellular
   Locations by GO-FunD-PseAA Predictor. Biochemical and Biophysical
   Research Communications, 2004, 320, 1236-1239.
.. [3] Gisbert Schneider and Paul Wrede. The Rational Design of Amino Acid
   Sequences by Artificial Neural Networks and Simulated Molecular
   Evolution: De Novo Design of an Idealized Leader Cleavage Site.
   Biophysical Journal, 1994, 66, 335-344.
"""
# Core Library
import json
import math
from typing import Any, Dict
# Third party
from pkg_resources import resource_filename
# First party
from propy import AALetter
# _Distance1 is the Schneider-Wrede physicochemical distance matrix
# used by Chou et al.
filepath = resource_filename(
    __name__, "data/schneider-wrede-physicochemical-distance-matrix.json"
)
# Read the JSON explicitly as UTF-8 so parsing does not depend on the
# platform's locale default encoding.
with open(filepath, "r", encoding="utf-8") as f:
    _Distance1: Dict[str, float] = json.load(f)

# _Distance2 is the Grantham chemical distance matrix used by Grantham et al.
filepath = resource_filename(__name__, "data/grantham-chemical-distance-matrix.json")
with open(filepath, "r", encoding="utf-8") as f:
    _Distance2: Dict[str, int] = json.load(f)
def GetSequenceOrderCouplingNumber(
    ProteinSequence: str, d: int = 1, distancematrix: Dict[str, float] = _Distance1
):
    """
    Compute the dth-rank sequence order coupling number for a protein.

    Every pair of residues that lie ``d`` positions apart is looked up in
    ``distancematrix`` and the squared distances are summed.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    d : int
        the gap between two amino acids.
    distancematrix : Dict[str, float]
        maps the two-letter concatenation of a residue pair (first residue
        followed by the residue ``d`` positions later) to their distance.
        Defaults to the module-level ``_Distance1`` matrix.

    Returns
    -------
    tau : float
        sum of the squared pair distances, rounded to 3 decimal places.
        It is 0.0 when the sequence has no pair of residues ``d`` apart.

    Example
    -------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumber(protein)
    """
    coupling = 0.0
    # Number of start positions that still have a partner d residues ahead.
    pair_count = len(ProteinSequence) - d
    for start in range(pair_count):
        pair = ProteinSequence[start] + ProteinSequence[start + d]
        coupling += math.pow(distancematrix[pair], 2)
    return round(coupling, 3)
def GetSequenceOrderCouplingNumberp(
    ProteinSequence: str, maxlag: int = 30, distancematrix: Dict[Any, Any] = None
):
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag using a
    user-defined distance matrix.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger
        than maxlag.
    distancematrix : Dict[Any, Any]
        contains 400 distance values. When omitted (``None``) an empty
        mapping is passed on to :py:func:`GetSequenceOrderCouplingNumber`,
        so a matrix has to be supplied for any sequence that contains at
        least one residue pair to look up.

    Returns
    -------
    Tau : Dict[str, float]
        one entry per lag, keyed ``"tau1"`` ... ``"tau<maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberp(protein)
    """
    matrix = {} if distancematrix is None else distancematrix
    return {
        f"tau{lag}": GetSequenceOrderCouplingNumber(ProteinSequence, lag, matrix)
        for lag in range(1, maxlag + 1)
    }
def GetSequenceOrderCouplingNumberSW(
    ProteinSequence: str, maxlag: int = 30, distancematrix=_Distance1
):
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag based on
    the Schneider-Wrede physicochemical distance matrix.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    distancematrix : Dict[Any, Any]
        the distance matrix to use; defaults to the module-level
        Schneider-Wrede physicochemical distance matrix

    Returns
    -------
    Tau : Dict[Any, Any]
        one coupling number per lag, keyed ``"tausw1"`` ...
        ``"tausw<maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberSW(protein)
    """
    return {
        f"tausw{lag}": GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )
        for lag in range(1, maxlag + 1)
    }
def GetSequenceOrderCouplingNumberGrant(
    ProteinSequence: str, maxlag: int = 30, distancematrix=_Distance2
):
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag based on
    the Grantham chemical distance matrix.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    distancematrix : Dict[Any, Any]
        the distance matrix to use; defaults to the module-level Grantham
        chemical distance matrix

    Returns
    -------
    Tau : Dict[Any, Any]
        one coupling number per lag, keyed ``"taugrant1"`` ...
        ``"taugrant<maxlag>"``

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberGrant(protein)
    """
    return {
        f"taugrant{lag}": GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )
        for lag in range(1, maxlag + 1)
    }
def GetSequenceOrderCouplingNumberTotal(
    ProteinSequence: str, maxlag: int = 30
) -> Dict[Any, Any]:
    """
    Compute the sequence order coupling numbers for lags 1 to maxlag with
    both built-in distance matrices.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag

    Returns
    -------
    result : Dict
        the entries of :py:func:`GetSequenceOrderCouplingNumberSW` followed
        by those of :py:func:`GetSequenceOrderCouplingNumberGrant`

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetSequenceOrderCouplingNumberTotal(protein)
    """
    schneider_wrede = GetSequenceOrderCouplingNumberSW(ProteinSequence, maxlag=maxlag)
    grantham = GetSequenceOrderCouplingNumberGrant(ProteinSequence, maxlag=maxlag)
    return {**schneider_wrede, **grantham}
def GetAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the composition of amino acids for a given protein sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    result : Dict[str, float]
        maps every letter in ``AALetter`` to the fraction of the sequence
        made up of that letter, rounded to 3 decimal places. For an empty
        sequence every fraction is 0.0.

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetAAComposition(protein)
    """
    LengthSequence = len(ProteinSequence)
    if LengthSequence == 0:
        # Nothing to count: report zero fractions instead of dividing by zero.
        return {letter: 0.0 for letter in AALetter}
    return {
        letter: round(ProteinSequence.count(letter) / LengthSequence, 3)
        for letter in AALetter
    }
def GetQuasiSequenceOrder1(
    ProteinSequence: str, maxlag: int = 30, weight: float = 0.1, distancematrix=None
):
    """
    Compute the first group of quasi-sequence-order descriptors (one per
    letter in ``AALetter``) for a given protein sequence.

    Each descriptor is the amino acid composition value divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        weight applied to the summed coupling numbers
    distancematrix : Dict[Any, Any], optional
        distance matrix passed to
        :py:func:`GetSequenceOrderCouplingNumber`; an empty mapping is used
        when omitted

    Returns
    -------
    result : Dict[str, float]
        descriptors keyed ``"QSO1"``, ``"QSO2"``, ... in ``AALetter`` order,
        rounded to 6 decimal places

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder1(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    matrix = {} if distancematrix is None else distancematrix
    coupling_total = 0.0
    for lag in range(1, maxlag + 1):
        coupling_total += GetSequenceOrderCouplingNumber(ProteinSequence, lag, matrix)
    composition = GetAAComposition(ProteinSequence)
    # Shared normalisation term of all quasi-sequence-order descriptors.
    denominator = 1 + weight * coupling_total
    return {
        f"QSO{position}": round(composition[residue] / denominator, 6)
        for position, residue in enumerate(AALetter, start=1)
    }
def GetQuasiSequenceOrder2(
    ProteinSequence: str, maxlag=30, weight=0.1, distancematrix=None
):
    """
    Compute the last maxlag quasi-sequence-order descriptors for a given
    protein sequence.

    Each descriptor is ``weight`` times one coupling number, divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        weight applied to the coupling numbers
    distancematrix : Dict[Any, Any], optional
        distance matrix passed to
        :py:func:`GetSequenceOrderCouplingNumber`; an empty mapping is used
        when omitted

    Returns
    -------
    result : Dict[str, float]
        descriptors keyed ``"QSO21"`` ... ``"QSO<20 + maxlag>"``, rounded to
        6 decimal places

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder2(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    matrix = {} if distancematrix is None else distancematrix
    couplings = [
        GetSequenceOrderCouplingNumber(ProteinSequence, lag, matrix)
        for lag in range(1, maxlag + 1)
    ]
    denominator = 1 + weight * sum(couplings)
    # Numbering continues after the 20 composition-based descriptors.
    return {
        f"QSO{20 + lag}": round(weight * coupling / denominator, 6)
        for lag, coupling in enumerate(couplings, start=1)
    }
def GetQuasiSequenceOrder1SW(
    ProteinSequence: str, maxlag=30, weight=0.1, distancematrix=_Distance1
):
    """
    Compute the first group of quasi-sequence-order descriptors (one per
    letter in ``AALetter``) using the Schneider-Wrede distance matrix.

    Each descriptor is the amino acid composition value divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        weight applied to the summed coupling numbers
    distancematrix : Dict[Any, Any], optional
        distance matrix; defaults to the module-level Schneider-Wrede
        physicochemical distance matrix

    Returns
    -------
    result : Dict[str, float]
        descriptors keyed ``"QSOSW1"``, ``"QSOSW2"``, ... in ``AALetter``
        order, rounded to 6 decimal places

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder1SW(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    coupling_total = 0.0
    for lag in range(1, maxlag + 1):
        coupling_total += GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )
    composition = GetAAComposition(ProteinSequence)
    # Shared normalisation term of all quasi-sequence-order descriptors.
    denominator = 1 + weight * coupling_total
    return {
        f"QSOSW{position}": round(composition[residue] / denominator, 6)
        for position, residue in enumerate(AALetter, start=1)
    }
def GetQuasiSequenceOrder2SW(
    ProteinSequence: str, maxlag=30, weight=0.1, distancematrix=_Distance1
):
    """
    Compute the last maxlag quasi-sequence-order descriptors using the
    Schneider-Wrede distance matrix.

    Each descriptor is ``weight`` times one coupling number, divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        weight applied to the coupling numbers
    distancematrix : Dict[Any, Any], optional
        distance matrix; defaults to the module-level Schneider-Wrede
        physicochemical distance matrix

    Returns
    -------
    result : Dict[str, float]
        descriptors keyed ``"QSOSW21"`` ... ``"QSOSW<20 + maxlag>"``,
        rounded to 6 decimal places

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder2SW(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    couplings = [
        GetSequenceOrderCouplingNumber(ProteinSequence, lag, distancematrix)
        for lag in range(1, maxlag + 1)
    ]
    denominator = 1 + weight * sum(couplings)
    # Numbering continues after the 20 composition-based descriptors.
    return {
        f"QSOSW{20 + lag}": round(weight * coupling / denominator, 6)
        for lag, coupling in enumerate(couplings, start=1)
    }
def GetQuasiSequenceOrder1Grant(
    ProteinSequence: str,
    maxlag: int = 30,
    weight: float = 0.1,
    distancematrix=_Distance2,
):
    """
    Compute the first group of quasi-sequence-order descriptors (one per
    letter in ``AALetter``) using the Grantham distance matrix.

    Each descriptor is the amino acid composition value divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        weight applied to the summed coupling numbers
    distancematrix : Dict[Any, Any], optional
        distance matrix; defaults to the module-level Grantham chemical
        distance matrix

    Returns
    -------
    result : Dict[str, float]
        descriptors keyed ``"QSOgrant1"``, ``"QSOgrant2"``, ... in
        ``AALetter`` order, rounded to 6 decimal places

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder1Grant(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    coupling_total = 0.0
    for lag in range(1, maxlag + 1):
        coupling_total += GetSequenceOrderCouplingNumber(
            ProteinSequence, lag, distancematrix
        )
    composition = GetAAComposition(ProteinSequence)
    # Shared normalisation term of all quasi-sequence-order descriptors.
    denominator = 1 + weight * coupling_total
    return {
        f"QSOgrant{position}": round(composition[residue] / denominator, 6)
        for position, residue in enumerate(AALetter, start=1)
    }
def GetQuasiSequenceOrder2Grant(
    ProteinSequence: str,
    maxlag: int = 30,
    weight: float = 0.1,
    distancematrix=_Distance2,
):
    """
    Compute the last maxlag quasi-sequence-order descriptors using the
    Grantham distance matrix.

    Each descriptor is ``weight`` times one coupling number, divided by
    ``1 + weight * (sum of the coupling numbers for lags 1..maxlag)``.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        weight applied to the coupling numbers
    distancematrix : Dict[Any, Any], optional
        distance matrix; defaults to the module-level Grantham chemical
        distance matrix

    Returns
    -------
    result : Dict[str, float]
        descriptors keyed ``"QSOgrant21"`` ... ``"QSOgrant<20 + maxlag>"``,
        rounded to 6 decimal places

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder2Grant(protein)

    see :py:func:`GetQuasiSequenceOrder` for the choice of parameters.
    """
    couplings = [
        GetSequenceOrderCouplingNumber(ProteinSequence, lag, distancematrix)
        for lag in range(1, maxlag + 1)
    ]
    denominator = 1 + weight * sum(couplings)
    # Numbering continues after the 20 composition-based descriptors.
    return {
        f"QSOgrant{20 + lag}": round(weight * coupling / denominator, 6)
        for lag, coupling in enumerate(couplings, start=1)
    }
def GetQuasiSequenceOrder(
    ProteinSequence: str, maxlag: int = 30, weight: float = 0.1
) -> Dict[Any, Any]:
    """
    Compute quasi-sequence-order descriptors for a given protein.

    The descriptors are computed with both built-in distance matrices
    (Schneider-Wrede and Grantham) and merged into one dictionary.
    See [1]_ for details.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        a weight factor. Please see reference 1 for its choice.

    Returns
    -------
    result : Dict
        contains all quasi-sequence-order descriptors

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrder(protein)
    """
    # Each descriptor family paired with the distance matrix it is built on.
    families = (
        (GetQuasiSequenceOrder1SW, _Distance1),
        (GetQuasiSequenceOrder2SW, _Distance1),
        (GetQuasiSequenceOrder1Grant, _Distance2),
        (GetQuasiSequenceOrder2Grant, _Distance2),
    )
    descriptors: Dict[Any, Any] = {}
    for compute, matrix in families:
        descriptors.update(compute(ProteinSequence, maxlag, weight, matrix))
    return descriptors
def GetQuasiSequenceOrderp(
    ProteinSequence: str,
    maxlag: int = 30,
    weight: float = 0.1,
    distancematrix: Dict[Any, Any] = None,
) -> Dict[Any, Any]:
    """
    Compute quasi-sequence-order descriptors for a given protein using a
    user-defined distance matrix.

    See [1]_ for details.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence
    maxlag : int, optional (default: 30)
        the maximum lag; the length of the protein should be larger than
        maxlag
    weight : float, optional (default: 0.1)
        a weight factor. Please see reference 1 for its choice.
    distancematrix : Dict[Any, Any]
        contains 400 distance values; an empty mapping is used when omitted

    Returns
    -------
    result : Dict[Any, Any]
        the entries of :py:func:`GetQuasiSequenceOrder1` followed by those
        of :py:func:`GetQuasiSequenceOrder2`

    Examples
    --------
    >>> from propy.GetProteinFromUniprot import GetProteinSequence
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    >>> result = GetQuasiSequenceOrderp(protein)
    """
    matrix = {} if distancematrix is None else distancematrix
    composition_part = GetQuasiSequenceOrder1(ProteinSequence, maxlag, weight, matrix)
    coupling_part = GetQuasiSequenceOrder2(ProteinSequence, maxlag, weight, matrix)
    return {**composition_part, **coupling_part}