{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "l5TuW6z28Jdq" }, "source": [ "# Open and parse a remote multifasta file" ] }, { "cell_type": "markdown", "metadata": { "id": "XCiKVC1zO3Cs" }, "source": [ "## The multifasta file is compressed with gzip\n", "\n", "### The directory that contains the multifasta file\n", "[https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/)\n", "\n", "**Note**: It is not the case here, but if a multifasta has blank lines between sequences, Biopython takes care of them!\n", "\n", "### See the multifasta uncompressed\n", "[https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta](https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta)\n", "\n", "The compressed file is in the same directory: UP000464024_2697049.fasta.gz" ] }, { "cell_type": "markdown", "metadata": { "id": "z1OM1VQ48VVA" }, "source": [ "## Install and import Biopython" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 9326, "status": "ok", "timestamp": 1731487236970, "user": { "displayName": "Enrique M. 
Muro", "userId": "08274162213514611628" }, "user_tz": -60 }, "id": "2QZgWpUXdo_I", "outputId": "8c0cab06-648b-4afa-e9b1-0ffdd958a479" }, "outputs": [], "source": [
 "# Install (optional) and import Biopython\n",
 "INSTALL_BIOPYTHON = False  # set to True on a fresh runtime (e.g. Google Colab) where Biopython is missing\n",
 "if INSTALL_BIOPYTHON:\n",
 "    # 'biopython' is the PyPI distribution that provides the 'Bio' package;\n",
 "    # %pip (unlike !pip) installs into the environment of the running kernel.\n",
 "    %pip install biopython\n",
 "\n",
 "from Bio import SeqIO\n",
 "import requests # HTTP requests\n",
 "import gzip\n",
 "from io import BytesIO\n",
 "import sys"
 ] }, { "cell_type": "markdown", "metadata": { "id": "HymjvjIb85ED" }, "source": [ "## Retrieve the multifasta. From internet if it is not available locally\n", "\n", "## Parse the file.\n", "- Iterate all over the sequences\n", "- Show the first sequence the descriptor, sequence and calculate the sequence length\n", "- Count the number of sequences and the total number of amino acids in the proteome\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 1802, "status": "ok", "timestamp": 1731487416639, "user": { "displayName": "Enrique M. 
Muro", "userId": "08274162213514611628" }, "user_tz": -60 }, "id": "0yNeWVk-iZms", "outputId": "ccdbd50a-663a-46f7-dd89-7961d6c2a3fd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\n", "\n", "SeqRecord(seq=Seq('MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT'), id='sp|A0A663DJA2|ORF10_SARS2', name='sp|A0A663DJA2|ORF10_SARS2', description='sp|A0A663DJA2|ORF10_SARS2 Putative ORF10 protein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=ORF10 PE=5 SV=1', dbxrefs=[])\n", "\n", "Descripción: sp|A0A663DJA2|ORF10_SARS2 Putative ORF10 protein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=ORF10 PE=5 SV=1\n", "ID: sp|A0A663DJA2|ORF10_SARS2\n", "Uniprot accession number: A0A663DJA2\n", "Sequence: MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT\n", "Results:\n", "The fasta file has a total of 17 sequences\n", "Total number of aa in the proteome: 14439 aa\n", "Average length of a protein: 849.3529411764706 aa\n" ] } ], "source": [
 "# URL of the gzip fasta file\n",
 "#url = \"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.fasta.gz\"\n",
 "url = \"https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\"\n",
 "print(url)\n",
 "\n",
 "# Counters are initialised before the download so that the summary at the\n",
 "# end of the cell is well defined even when the request fails.\n",
 "num_of_seqs = 0\n",
 "aa_in_proteome = 0\n",
 "\n",
 "# Download the gzip fasta file; the timeout keeps the cell from hanging\n",
 "# forever when the server does not answer.\n",
 "response = requests.get(url, timeout=60)\n",
 "# Was the request fine?\n",
 "if response.status_code == 200:\n",
 "    # Wrap the downloaded bytes in BytesIO: it behaves like a \"file in memory\"\n",
 "    gzip_io = BytesIO(response.content)\n",
 "\n",
 "    # Decompress and read\n",
 "    with gzip.open(gzip_io, \"rt\") as fasta_io: # rt: read in text mode\n",
 "        for record in SeqIO.parse(fasta_io, \"fasta\"):\n",
 "            num_of_seqs += 1 # increase the seq counter\n",
 "            if num_of_seqs == 1: # show the details of the first sequence only\n",
 "                print(type(record))\n",
 "                print(repr(record)) # repr gives a printable representation of the object\n",
 "                print(f\"\\nDescripción: {record.description}\")\n",
 "                print(f\"ID: {record.id}\") # full identifier, e.g. sp|A0A663DJA2|ORF10_SARS2\n",
 "                acc_num = record.id.split('|')[1] # the accession is the 2nd '|'-separated field\n",
 "                print(f\"Uniprot accession number: {acc_num}\")\n",
 "                print(f\"Sequence: {record.seq}\")\n",
 "\n",
 "            aa_in_proteome += len(record.seq)\n",
 "else:\n",
 "    print(f\"Couldn't get the file. HTTP status: {response.status_code}\")\n",
 "\n",
 "# results: only reported when at least one sequence was read, which avoids\n",
 "# a division by zero (and stale numbers after a failed download)\n",
 "if num_of_seqs > 0:\n",
 "    print(\"Results:\")\n",
 "    print(\"The fasta file has a total of\", num_of_seqs, \"sequences\")\n",
 "    print(\"Total number of aa in the proteome: \", aa_in_proteome, \"aa\")\n",
 "    print(\"Average length of a protein: \", aa_in_proteome/num_of_seqs, \"aa\")\n",
 "else:\n",
 "    print(\"No sequences were read; there is nothing to summarize.\")"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "id": "OHyuC84ZVCs6" }, "source": [ "### Access to the 4th sequence in the multifasta" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 707, "status": "ok", "timestamp": 1731487430671, "user": { "displayName": "Enrique M. 
Muro", "userId": "08274162213514611628" }, "user_tz": -60 }, "id": "hdi9uoyEhlac", "outputId": "c646d61f-1ca9-4747-d493-69db2fec8f13" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sequence number 4 is:\n", "\n", "seq id:\t sp|P0DTC3|AP3A_SARS2\n", "seq:\t MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLAVFQSASKIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLEAPFLYLYALVYFLQSINFVRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPYNSVTSSIVITSGDGTTSPISEHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQLSTDTGVEHVTFFIYNKIVDEPEEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL\n", "seq len: 275 \n", "\n" ] } ], "source": [
 "SEQ_NUMBER = 4 # 1-based position of the wanted sequence in the multifasta\n",
 "\n",
 "# Re-wrap the bytes downloaded by the previous cell and parse every record.\n",
 "gzip_io = BytesIO(response.content)\n",
 "with gzip.open(gzip_io, \"rt\") as fasta_io: # rt: text mode (the FASTA parser expects text, not bytes)\n",
 "    all_records = list(SeqIO.parse(fasta_io, \"fasta\"))\n",
 "\n",
 "# Guard against a file with fewer sequences than the requested position.\n",
 "if 1 <= SEQ_NUMBER <= len(all_records):\n",
 "    selected_record = all_records[SEQ_NUMBER - 1] # list indices start at 0\n",
 "    print(f\"The sequence number {SEQ_NUMBER} is:\", \"\\n\", type(selected_record), sep=\"\")\n",
 "    print(\"seq id:\\t\", selected_record.id)\n",
 "    selected_seq = selected_record.seq\n",
 "    print(\"seq:\\t\", selected_seq)\n",
 "    print(\"seq len:\", len(selected_seq), \"\\n\")\n",
 "else:\n",
 "    print(f\"The multifasta has {len(all_records)} sequences; there is no sequence number {SEQ_NUMBER}.\")"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "id": "OHyuC84ZVCs6" }, "source": [ "### Access to a sequence given a Uniprot accession number (ie. P0DTC2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 707, "status": "ok", "timestamp": 1731487430671, "user": { "displayName": "Enrique M. 
Muro", "userId": "08274162213514611628" }, "user_tz": -60 }, "id": "hdi9uoyEhlac", "outputId": "c646d61f-1ca9-4747-d493-69db2fec8f13" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accession: sp|P0DTC3|AP3A_SARS2\n", "Description: sp|P0DTC3|AP3A_SARS2 ORF3a protein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=3a PE=1 SV=1\n", "Sequence: MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLAVFQSASKIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLEAPFLYLYALVYFLQSINFVRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPYNSVTSSIVITSGDGTTSPISEHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQLSTDTGVEHVTFFIYNKIVDEPEEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL\n", "Sequence length: 275 aa\n" ] } ], "source": [
 "def get_sequence_by_accession(records, accession, exact=False):\n",
 "    \"\"\"Return the first record whose id matches an accession, or None.\n",
 "\n",
 "    records:   iterable of objects with an ``id`` attribute (e.g. the\n",
 "               records produced by SeqIO.parse).\n",
 "    accession: accession, or a fragment of it, to look for.\n",
 "    exact:     False (default) does a substring search anywhere in the id,\n",
 "               so a fragment such as \"42858\" works but a short key may hit\n",
 "               the wrong entry. True requires the accession to be equal to\n",
 "               one whole '|'-separated field of the id.\n",
 "    \"\"\"\n",
 "    # Iterate over each sequence from the multifasta\n",
 "    for record in records:\n",
 "        if exact:\n",
 "            matched = accession in record.id.split(\"|\")\n",
 "        else:\n",
 "            matched = accession in record.id\n",
 "        if matched:\n",
 "            return record\n",
 "    return None # no accession found\n",
 "\n",
 "def get_sequence_by_gene_name(records, key, exact=False):\n",
 "    \"\"\"Return the first record whose description matches a gene name, or None.\n",
 "\n",
 "    records: iterable of objects with a ``description`` attribute.\n",
 "    key:     gene name (or any text) to look for in the description.\n",
 "    exact:   False (default) does a substring search in the whole\n",
 "             description, so \"HBA\" also matches e.g. \"HBA1\". True requires\n",
 "             a whitespace-separated token equal to \"GN=<key>\".\n",
 "    \"\"\"\n",
 "    # Iterate over each sequence from the multifasta\n",
 "    for record in records:\n",
 "        if exact:\n",
 "            matched = f\"GN={key}\" in record.description.split()\n",
 "        else:\n",
 "            matched = key in record.description\n",
 "        if matched:\n",
 "            return record\n",
 "    return None # no gene name found\n",
 "\n",
 "KEY = \"P0DTC3\" #\"42858\" #\"69905\" #\"P60484\" # accession\n",
 "#KEY = \"TOP1\" #\"ADH4\" #\"PDGFB\" \"PTEN\" #\"HBA\" # gene name\n",
 "#\n",
 "gzip_io = BytesIO(response.content)\n",
 "with gzip.open(gzip_io, \"rt\") as fasta_io: # rt: read in text mode\n",
 "    # The search runs inside the \"with\" block because the records are\n",
 "    # read from the open file while iterating.\n",
 "    records = SeqIO.parse(fasta_io, \"fasta\")\n",
 "    record = get_sequence_by_accession(records, KEY)\n",
 "    #record = get_sequence_by_gene_name(records, KEY)\n",
 "\n",
 "if record is not None:\n",
 "    print(f\"Accession: {record.id}\")\n",
 "    print(f\"Description: {record.description}\")\n",
 "    print(f\"Sequence: {record.seq}\")\n",
 "    print(f\"Sequence length: {len(record.seq)} aa\")\n",
 "else:\n",
 "    print(\"Key (accession) not found in the multifasta.\")"
 ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "authorship_tag": "ABX9TyOClWfxv9E2hjAN2EMkU+jd", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.9" } }, "nbformat": 4, "nbformat_minor": 4 }