{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "l5TuW6z28Jdq"
   },
   "source": [
    "# Open and parse a remote multifasta file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XCiKVC1zO3Cs"
   },
   "source": [
    "## The multifasta file compressed with gzip\n",
    "\n",
    "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\n",
    "\n",
    "### The directory that contains the multifasta file\n",
    "[https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024)\n",
    "\n",
    "**Note**: if the multifasta has a blank line between sequences, Biopython takes care of it!\n",
    "\n",
    "### See the multifasta @\n",
    "\n",
    "[https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta](https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "z1OM1VQ48VVA"
   },
   "source": [
    "## Install and import Biopython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 9326,
     "status": "ok",
     "timestamp": 1731487236970,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "2QZgWpUXdo_I",
    "outputId": "8c0cab06-648b-4afa-e9b1-0ffdd958a479"
   },
   "outputs": [],
   "source": [
    "# First install biopython if needed\n",
    "INSTALL_BIOPYTHON = False  # set to True only in colab, not in the JGU server\n",
    "if INSTALL_BIOPYTHON:\n",
    "    # %pip installs into the environment of the running kernel;\n",
    "    # the PyPI name of the package that provides `Bio` is biopython\n",
    "    %pip install biopython\n",
    "\n",
    "# import the modules\n",
    "from Bio import SeqIO\n",
    "import requests\n",
    "import gzip\n",
    "from io import BytesIO\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Amino acids (mass in Da)\n",
    "### Note \n",
    "1 Da = $1.660539 \\times 10^{-27}$ kg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The mass of the different amino acids\n",
    "this data structure is a dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "amino_acid_masses = {\n",
    "    \"A\": 89.1,   # Alanine\n",
    "    \"R\": 174.2,  # Arginine\n",
    "    \"N\": 132.1,  # Asparagine\n",
    "    \"D\": 133.1,  # Aspartic acid\n",
    "    \"C\": 121.2,  # Cysteine\n",
    "    \"Q\": 146.2,  # Glutamine\n",
    "    \"E\": 147.1,  # Glutamic acid\n",
    "    \"G\": 75.1,   # Glycine\n",
    "    \"H\": 155.2,  # Histidine\n",
    "    \"I\": 131.2,  # Isoleucine\n",
    "    \"L\": 131.2,  # Leucine\n",
    "    \"K\": 146.2,  # Lysine\n",
    "    \"M\": 149.2,  # Methionine\n",
    "    \"F\": 165.2,  # Phenylalanine\n",
    "    \"P\": 115.1,  # Proline\n",
    "    \"S\": 105.1,  # Serine\n",
    "    \"T\": 119.1,  # Threonine\n",
    "    \"W\": 204.2,  # Tryptophan\n",
    "    \"Y\": 181.2,  # Tyrosine\n",
    "    \"V\": 117.1   # Valine\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the weight of a molecule of water"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The molecular weight of one molecule of water is 18.016 Da.\n"
     ]
    }
   ],
   "source": [
    "# Atomic weights in g/mol\n",
    "hydrogen_weight = 1.008  # g/mol\n",
    "oxygen_weight = 16.00  # g/mol\n",
    "\n",
    "# Calculate molecular weight of water (H2O)\n",
    "water_weight = (2 * hydrogen_weight) + oxygen_weight\n",
    "\n",
    "# Output the result in Da (1 g/mol = 1 Da)\n",
    "print(f\"The molecular weight of one molecule of water is {water_weight} Da.\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HymjvjIb85ED"
   },
   "source": [
    "## Retrieve the multifasta of the proteome (from internet in this case)\n",
    "and process it.\n",
    "\n",
    "## Parse the file.\n",
    "- Iterate over all the sequences\n",
    "- For each sequence, accumulate its length and its mass\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1802,
     "status": "ok",
     "timestamp": 1731487416639,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "0yNeWVk-iZms",
    "outputId": "ccdbd50a-663a-46f7-dd89-7961d6c2a3fd"
   },
   "outputs": [],
   "source": [
    "# URL of the gzip fasta file\n",
    "# sars2\n",
    "url = \"https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\"\n",
    "print(url)\n",
    "\n",
    "# Running totals. They are initialised before the download so that the\n",
    "# report at the end of the cell never refers to an undefined name.\n",
    "num_of_seqs = 0\n",
    "aa_in_proteome = 0\n",
    "unknown_aa = 0  # residues without an entry in amino_acid_masses\n",
    "proteome_mass_with_water_molecule = 0.0  # sum of the free amino acid masses\n",
    "proteome_mass = 0.0  # polypeptide mass: one water is lost per peptide bond\n",
    "\n",
    "# load the gzip fasta file (the timeout avoids waiting forever on a dead server)\n",
    "response = requests.get(url, timeout=60)\n",
    "# Was the request fine?\n",
    "if response.status_code == 200:\n",
    "    # load it in BytesIO. It will be kind of a \"file in memory\"\n",
    "    gzip_io = BytesIO(response.content)\n",
    "\n",
    "    # Decompress, read each sequence and calculate\n",
    "    with gzip.open(gzip_io, \"rt\") as fasta_io:  # \"rt\": read in text mode, it is needed\n",
    "        for record in SeqIO.parse(fasta_io, \"fasta\"):\n",
    "            num_of_seqs += 1  # increase the seq counter\n",
    "            seq_length = len(record.seq)\n",
    "            aa_in_proteome += seq_length\n",
    "\n",
    "            # Sum of the free amino acid masses of this protein\n",
    "            free_aa_mass = 0.0\n",
    "            for aa in record.seq:\n",
    "                aa_mass = amino_acid_masses.get(aa)\n",
    "                if aa_mass is None:  # symbol that is not in the mass table\n",
    "                    unknown_aa += 1\n",
    "                    aa_mass = 0.0\n",
    "                free_aa_mass += aa_mass\n",
    "            proteome_mass_with_water_molecule += free_aa_mass\n",
    "\n",
    "            # A chain of n residues has n - 1 peptide bonds and every bond\n",
    "            # releases one molecule of water, so the correction is applied\n",
    "            # per protein and not per position in the whole proteome.\n",
    "            peptide_bonds = max(seq_length - 1, 0)\n",
    "            proteome_mass += free_aa_mass - peptide_bonds * water_weight\n",
    "else:\n",
    "    print(f\"Couldn't get the file. HTTP status: {response.status_code}\")\n",
    "\n",
    "# results\n",
    "if num_of_seqs == 0 or aa_in_proteome == 0:\n",
    "    # Nothing was parsed (failed download or empty file): the averages are undefined\n",
    "    print(\"No protein sequences were parsed, so there are no results to report.\")\n",
    "else:\n",
    "    print(\"Results:\")\n",
    "    print(\"The proteome has a total of\", num_of_seqs, \"proteins\")\n",
    "    print(\"Total number of aa in the proteome: \", aa_in_proteome, \"aa\")\n",
    "    print(f\"Average length of a protein: {aa_in_proteome/num_of_seqs} aa\")\n",
    "    print(f\"Total mass of the proteome: {proteome_mass} Da\")\n",
    "    print(f\"Average aa mass in this proteome (including the water molecule that is not in each dipeptide): {proteome_mass_with_water_molecule/aa_in_proteome} Da\")\n",
    "    print(f\"Average aa mass in this proteome (standard): {proteome_mass/aa_in_proteome} Da\")\n",
    "    if unknown_aa > 0:\n",
    "        # Unknown symbols were counted as residues with a mass of 0 Da\n",
    "        print(f\"Warning: {unknown_aa} aa without a known mass were counted with a mass of 0 Da\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOClWfxv9E2hjAN2EMkU+jd",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}