{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "l5TuW6z28Jdq"
   },
   "source": [
    "# Open and parse a remote multifasta file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XCiKVC1zO3Cs"
   },
   "source": [
    "## The multifasta file compressed with gzip\n",
    "\n",
    "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\n",
    "\n",
    "### The directory that contains the multifasta file\n",
    "[https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024)\n",
    "\n",
    "**Note**: if the multifasta file has blank lines between sequences, Biopython handles them automatically.\n",
    "\n",
    "### View the uncompressed multifasta at\n",
    "\n",
    "[https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta](https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "z1OM1VQ48VVA"
   },
   "source": [
    "## Install and import Biopython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 9326,
     "status": "ok",
     "timestamp": 1731487236970,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "2QZgWpUXdo_I",
    "outputId": "8c0cab06-648b-4afa-e9b1-0ffdd958a479"
   },
   "outputs": [],
   "source": [
    "# Biopython is published on PyPI as \"biopython\" and imported as \"Bio\".\n",
    "# Keep the flag False on the JGU server; set it to True on Google Colab\n",
    "# to install Biopython into the environment of the running kernel.\n",
    "INSTALL_BIOPYTHON = False\n",
    "if INSTALL_BIOPYTHON:\n",
    "    %pip install biopython\n",
    "\n",
    "# import the modules\n",
    "from Bio import SeqIO\n",
    "import requests\n",
    "import gzip\n",
    "from io import BytesIO\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Amino acids (mass in Da)\n",
    "### Note \n",
    "1 Da = $1.660539 \\times 10^{-27}$ kg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The mass of the different amino acids\n",
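    "The masses are stored in a dictionary keyed by the one-letter amino acid code. Each value is the mass of the free amino acid, so it still includes the water molecule that is released when a peptide bond forms."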
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mass (Da) of each amino acid as a free molecule, keyed by its one-letter code\n",
    "amino_acid_masses = {\n",
    "    'A': 89.1,   # Ala, alanine\n",
    "    'R': 174.2,  # Arg, arginine\n",
    "    'N': 132.1,  # Asn, asparagine\n",
    "    'D': 133.1,  # Asp, aspartic acid\n",
    "    'C': 121.2,  # Cys, cysteine\n",
    "    'Q': 146.2,  # Gln, glutamine\n",
    "    'E': 147.1,  # Glu, glutamic acid\n",
    "    'G': 75.1,   # Gly, glycine\n",
    "    'H': 155.2,  # His, histidine\n",
    "    'I': 131.2,  # Ile, isoleucine\n",
    "    'L': 131.2,  # Leu, leucine\n",
    "    'K': 146.2,  # Lys, lysine\n",
    "    'M': 149.2,  # Met, methionine\n",
    "    'F': 165.2,  # Phe, phenylalanine\n",
    "    'P': 115.1,  # Pro, proline\n",
    "    'S': 105.1,  # Ser, serine\n",
    "    'T': 119.1,  # Thr, threonine\n",
    "    'W': 204.2,  # Trp, tryptophan\n",
    "    'Y': 181.2,  # Tyr, tyrosine\n",
    "    'V': 117.1,  # Val, valine\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the weight of a molecule of water"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The molecular weight of one molecule of water is 18.016 Da.\n"
     ]
    }
   ],
   "source": [
    "# Atomic weights; a molar mass in g/mol is numerically the mass of one molecule in Da\n",
    "hydrogen_weight = 1.008\n",
    "oxygen_weight = 16.00\n",
    "\n",
    "# H2O: two hydrogen atoms bound to one oxygen atom\n",
    "water_weight = 2 * hydrogen_weight + oxygen_weight\n",
    "\n",
    "print(f\"The molecular weight of one molecule of water is {water_weight} Da.\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HymjvjIb85ED"
   },
   "source": [
    "## Retrieve the multifasta of the proteome (from the internet in this case) and process it\n",
    "\n",
    "## Parse the file.\n",
    "- Iterate over all the sequences\n",
    "- For each sequence, add its length and its mass to the proteome totals\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1802,
     "status": "ok",
     "timestamp": 1731487416639,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "0yNeWVk-iZms",
    "outputId": "ccdbd50a-663a-46f7-dd89-7961d6c2a3fd"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\n",
      "Results:\n",
      "The proteome has a total of 17 proteins\n",
      "Total number of aa in the proteome:  14439 aa\n",
      "Average length of a protein: 849.3529411764706 aa\n",
      "Total mass of the proteome: 1613005.8839999083 Da\n",
      "Average aa mass in this proteome (including the water molecule that is not in each dipeptide): 129.68033104785883 Da\n",
      "Average aa mass in this proteome (standard): 111.71174485767078 Da\n"
     ]
    }
   ],
   "source": [
    "# URL of the gzip-compressed multifasta file (SARS-CoV-2 reference proteome)\n",
    "url = \"https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\"\n",
    "print(url)\n",
    "\n",
    "\n",
    "def _polypeptide_masses(sequence, residue_masses, water_mass):\n",
    "    \"\"\"Return the masses of one protein sequence.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sequence : str\n",
    "        One-letter amino acid sequence of the protein.\n",
    "    residue_masses : dict\n",
    "        Mass (Da) of each free amino acid, keyed by one-letter code.\n",
    "    water_mass : float\n",
    "        Mass (Da) of one molecule of water.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        (free_mass, chain_mass, unknown) where free_mass is the sum of the\n",
    "        free amino acid masses, chain_mass is free_mass minus one water per\n",
    "        peptide bond, and unknown is the number of letters that have no entry\n",
    "        in residue_masses (they are left out of both masses).\n",
    "    \"\"\"\n",
    "    known = [residue_masses[aa] for aa in sequence if aa in residue_masses]\n",
    "    free_mass = sum(known)\n",
    "    # A chain of n residues has n - 1 peptide bonds and each bond releases one\n",
    "    # molecule of water, so the first residue of every protein keeps its water.\n",
    "    peptide_bonds = max(len(known) - 1, 0)\n",
    "    chain_mass = free_mass - peptide_bonds * water_mass\n",
    "    return free_mass, chain_mass, len(sequence) - len(known)\n",
    "\n",
    "\n",
    "# The counters are initialised before the download so that the report below\n",
    "# never meets an undefined name when the request fails.\n",
    "num_of_seqs = 0\n",
    "aa_in_proteome = 0\n",
    "unknown_aa = 0  # letters without a mass in amino_acid_masses\n",
    "proteome_mass_with_water_molecule = 0.0  # sum of the free amino acid masses\n",
    "proteome_mass = 0.0  # polypeptide chains: water removed for every peptide bond\n",
    "\n",
    "# Download the gzip fasta file; the timeout avoids waiting forever on a silent server\n",
    "response = requests.get(url, timeout=60)\n",
    "# Was the request fine?\n",
    "if response.status_code == 200:\n",
    "    # Wrap the downloaded bytes in BytesIO: a kind of \"file in memory\"\n",
    "    gzip_io = BytesIO(response.content)\n",
    "\n",
    "    # Decompress in text mode (\"rt\") and accumulate the totals of every sequence\n",
    "    with gzip.open(gzip_io, \"rt\") as fasta_io:\n",
    "        for record in SeqIO.parse(fasta_io, \"fasta\"):\n",
    "            sequence = str(record.seq).upper()\n",
    "            free_mass, chain_mass, unknown = _polypeptide_masses(sequence, amino_acid_masses, water_weight)\n",
    "            num_of_seqs += 1\n",
    "            aa_in_proteome += len(sequence)\n",
    "            unknown_aa += unknown\n",
    "            proteome_mass_with_water_molecule += free_mass\n",
    "            proteome_mass += chain_mass\n",
    "else:\n",
    "    print(f\"Couldn't get the file. HTTP status: {response.status_code}\")\n",
    "\n",
    "# results\n",
    "known_aa = aa_in_proteome - unknown_aa  # residues that contributed to the masses\n",
    "if num_of_seqs == 0 or known_aa == 0:\n",
    "    # Nothing was read (failed download or empty file): avoid dividing by zero\n",
    "    print(\"No amino acids with a known mass were read, so there is nothing to report.\")\n",
    "else:\n",
    "    print(\"Results:\")\n",
    "    print(\"The proteome has a total of\", num_of_seqs, \"proteins\")\n",
    "    print(\"Total number of aa in the proteome: \", aa_in_proteome, \"aa\")\n",
    "    print(f\"Average length of a protein: {aa_in_proteome/num_of_seqs} aa\")\n",
    "    print(f\"Total mass of the proteome: {proteome_mass} Da\")\n",
    "    print(f\"Average aa mass in this proteome (including the water molecule that is not in each dipeptide): {proteome_mass_with_water_molecule/known_aa} Da\")\n",
    "    print(f\"Average aa mass in this proteome (standard): {proteome_mass/known_aa} Da\")\n",
    "    if unknown_aa:\n",
    "        print(f\"Warning: {unknown_aa} aa without a known mass were left out of the mass calculations\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOClWfxv9E2hjAN2EMkU+jd",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}