# Source code for propy.GetProteinFromUniprot

# propy3, formerly protpy, is a Python package to compute protein descriptors
# Copyright (C) 2012 Dongsheng Cao and Yizeng Liang, oriental-cds@163.com
# Copyright (C) 2020-2022 Martin Thoma

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; in version 2
# of the License.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA  02110-1301, USA.
"""
Download the protein sequence from `the uniprot website <http://www.uniprot.org/>`_.

You only need to provide a protein ID, or prepare a file (ID.txt) listing the
IDs. The result is a .txt file (ProteinSequence.txt) containing the protein
sequences you need.
"""

# Core Library
import os
from urllib.request import urlopen


def GetProteinSequence(ProteinID: str) -> str:
    """
    Get the protein sequence from the uniprot website by ID.

    The FASTA record for ``ProteinID`` is downloaded, its first (header)
    line is dropped, and the remaining lines are stripped and concatenated
    into a single string.

    Parameters
    ----------
    ProteinID : str
        indicating ID such as "P48039" or "Q9NQ39".

    Returns
    -------
    protein_sequence : str
        The sequence as one string without line breaks. An empty string is
        returned when the response has no lines after the header.

    Raises
    ------
    Any exception raised by ``urlopen`` (or by decoding the response as
    UTF-8) is not caught here and propagates to the caller.

    Examples
    --------
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    """
    if ProteinID == "Q9NQ39":
        # Use this as an example throughout the documentation; it is served
        # from this literal, so no network request is made for this ID.
        return (
            "MLMPKKNRIAIHELLFKEGVMVAKKDVHMPKHPELADKNVPNLHVMKAMQSLKSRGCVKEQ"
            "FAWRHFYWYLTNEGSQYLRDYLHLPPEIVPATLHLPPEIVPATLHRSRPETGRPRPKGLEG"
            "KRPARLTRREADRDTYRRCSVPPGADKKAEAGAGSATEFQFRGRCGRGRGQPPQ"
        )
    url = f"http://www.uniprot.org/uniprot/{ProteinID}.fasta"
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(url) as response:
        fasta_lines = response.readlines()
    # The first line is the FASTA header (a comment), so skip it.
    return "".join(line.decode("utf8").strip() for line in fasta_lines[1:])


def GetProteinSequenceFromTxt(path: str, openfile: str, savefile: str) -> int:
    """
    Get the protein sequence from the uniprot website by the file containing ID.

    Each non-blank line of the ID file is stripped and passed to
    ``GetProteinSequence``; the returned sequence is printed and written to
    the output file, one sequence per line. Blank lines are skipped.

    Parameters
    ----------
    path : str
        a directory path containing the ID file such as "/home/orient/protein/"
    openfile : str
        the ID file such as "proteinID.txt"
    savefile : str
        the file saving the obtained protein sequences such as "protein.txt"

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    Exceptions from opening the files (e.g. ``FileNotFoundError`` when the
    ID file does not exist) and from ``GetProteinSequence`` are not caught
    and propagate to the caller.
    """
    path = os.path.abspath(path)  # makes debugging easier
    id_path = os.path.join(path, openfile)
    out_path = os.path.join(path, savefile)
    # Open the ID file first: opening the output with "w" creates/truncates
    # it, which must not happen when the ID file cannot be read at all.
    with open(id_path, "r") as id_file, open(out_path, "w") as out_file:
        for index, line in enumerate(id_file):
            protein_id = line.strip()
            if not protein_id:
                continue
            sequence = GetProteinSequence(protein_id)
            print("-" * 80)
            # index is the 0-based line number in the ID file, so skipped
            # blank lines are included in the reported number.
            print(f"The {index + 1} protein sequence has been downloaded!")
            print(sequence)
            out_file.write(sequence + "\n")
            print("-" * 80)
    return 0