Source code for propy.GetProteinFromUniprot

# propy3, formerly protpy, is a Python package to compute protein descriptors
# Copyright (C) 2012 Dongsheng Cao and Yizeng Liang, oriental-cds@163.com
# Copyright (C) 2020-2022 Martin Thoma

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; in version 2
# of the License.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA  02110-1301, USA.
"""
Download the protein sequence from `the uniprot website <http://www.uniprot.org/>`_.

You only need to provide a single protein ID, or prepare a file (e.g. ID.txt)
listing one protein ID per line. The downloaded protein sequences can then be
saved to a .txt file (e.g. ProteinSequence.txt).
"""

# Core Library
import os
from urllib.request import urlopen


def GetProteinSequence(ProteinID: str) -> str:
    """
    Get the protein sequence from the uniprot website by ID.

    The FASTA record for ``ProteinID`` is downloaded, its first line (the
    FASTA header) is dropped, and the remaining lines are stripped of
    surrounding whitespace and concatenated into a single string.

    The ID ``"Q9NQ39"`` is answered from a hard-coded sequence without any
    network access, because it is used as the example throughout the
    documentation.

    Parameters
    ----------
    ProteinID : str
        indicating ID such as "P48039" or "Q9NQ39".

    Returns
    -------
    protein_sequence : str
        The sequence without the FASTA header line; an empty string if the
        response contains no sequence lines.

    Examples
    --------
    >>> protein = GetProteinSequence(ProteinID="Q9NQ39")
    """
    if ProteinID == "Q9NQ39":
        # Use this as an example throughout the documentation
        return (
            "MLMPKKNRIAIHELLFKEGVMVAKKDVHMPKHPELADKNVPNLHVMKAMQSLKSRGCVKEQ"
            "FAWRHFYWYLTNEGSQYLRDYLHLPPEIVPATLHLPPEIVPATLHRSRPETGRPRPKGLEG"
            "KRPARLTRREADRDTYRRCSVPPGADKKAEAGAGSATEFQFRGRCGRGRGQPPQ"
        )

    # Use the response as a context manager so the underlying connection is
    # closed even if reading fails, instead of being leaked.
    with urlopen(f"http://www.uniprot.org/uniprot/{ProteinID}.fasta") as response:
        fasta_lines = response.readlines()

    # The first line is the FASTA header (a comment), not sequence data.
    return "".join(line.decode("utf8").strip() for line in fasta_lines[1:])
def GetProteinSequenceFromTxt(path: str, openfile: str, savefile: str):
    """
    Download the sequence for every protein ID listed in a text file.

    The ID file is read line by line. Each non-blank line is stripped and
    passed to ``GetProteinSequence``; the returned sequence is printed
    (framed by dashed separator lines together with a progress message) and
    written to the save file, one sequence per line. Blank lines are
    skipped, but they still count towards the number shown in the progress
    message, which is the line's 1-based position in the ID file.

    Parameters
    ----------
    path : str
        a directory path containing the ID file such as "/home/orient/protein/"
    openfile : str
        the ID file such as "proteinID.txt"
    savefile : str
        the file saving the obtained protein sequences such as "protein.txt";
        it is created (or overwritten) inside ``path``.

    Returns
    -------
    int
        Always 0.
    """
    path = os.path.abspath(path)  # absolute paths make debugging easier
    output_path = os.path.join(path, savefile)
    ids_path = os.path.join(path, openfile)
    separator = "-" * 80

    # The save file is opened first, then the ID file.
    with open(output_path, "w") as sink, open(ids_path, "r") as id_lines:
        for line_number, raw_line in enumerate(id_lines, start=1):
            protein_id = raw_line.strip()
            if not protein_id:
                continue  # nothing to look up on a blank line

            sequence = GetProteinSequence(protein_id)
            print(separator)
            print(f"The {line_number} protein sequence has been downloaded!")
            print(sequence)
            sink.write(sequence + "\n")
            print(separator)
    return 0