{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "l5TuW6z28Jdq"
   },
   "source": [
    "# Open and parse a remote multifasta file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XCiKVC1zO3Cs"
   },
   "source": [
    "## The multifasta file is compressed with gzip\n",
    "\n",
    "### The directory that contains the multifasta file\n",
    "[https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/)\n",
    "\n",
    "**Note**: This is not the case here, but if the multifasta had blank lines between sequences, Biopython would handle them correctly.\n",
    "\n",
    "### See the multifasta uncompressed\n",
    "[https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta](https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta)\n",
    "\n",
    "The compressed file, UP000464024_2697049.fasta.gz, is in the same directory."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "z1OM1VQ48VVA"
   },
   "source": [
    "## Install and import Biopython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 9326,
     "status": "ok",
     "timestamp": 1731487236970,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "2QZgWpUXdo_I",
    "outputId": "8c0cab06-648b-4afa-e9b1-0ffdd958a479"
   },
   "outputs": [],
   "source": [
    "# Biopython provides the `Bio` package imported below; its name on PyPI is `biopython`.\n",
    "# On a fresh environment (e.g. Google Colab) uncomment the next line and run this cell once.\n",
    "# `%pip` (rather than `!pip`) installs into the environment of the running kernel.\n",
    "# %pip install -q biopython\n",
    "\n",
    "from Bio import SeqIO\n",
    "import requests # HTTP requests\n",
    "import gzip\n",
    "from io import BytesIO\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HymjvjIb85ED"
   },
   "source": [
    "## Retrieve the multifasta. From internet if it is not available locally\n",
    "\n",
    "## Parse the file.\n",
    "- Iterate all over the sequences\n",
    "- For the first sequence, show the descriptor and the sequence, and calculate the sequence length\n",
    "- Count the number of sequences and the total number of amino acids in the proteome\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1802,
     "status": "ok",
     "timestamp": 1731487416639,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "0yNeWVk-iZms",
    "outputId": "ccdbd50a-663a-46f7-dd89-7961d6c2a3fd"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\n",
      "<class 'Bio.SeqRecord.SeqRecord'>\n",
      "SeqRecord(seq=Seq('MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT'), id='sp|A0A663DJA2|ORF10_SARS2', name='sp|A0A663DJA2|ORF10_SARS2', description='sp|A0A663DJA2|ORF10_SARS2 Putative ORF10 protein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=ORF10 PE=5 SV=1', dbxrefs=[])\n",
      "\n",
      "Descripción: sp|A0A663DJA2|ORF10_SARS2 Putative ORF10 protein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=ORF10 PE=5 SV=1\n",
      "ID: sp|A0A663DJA2|ORF10_SARS2\n",
      "Uniprot accession number: A0A663DJA2\n",
      "Sequence: MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT\n",
      "Results:\n",
      "The fasta file has a total of 17 sequences\n",
      "Total number of aa in the proteome:  14439 aa\n",
      "Average length of a protein:  849.3529411764706 aa\n"
     ]
    }
   ],
   "source": [
    "# URL of the gzip-compressed multifasta file\n",
    "#url = \"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.fasta.gz\"\n",
    "url = \"https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz\"\n",
    "print(url)\n",
    "\n",
    "# Initialise the counters before the download so that they are always defined,\n",
    "# even when the request fails and the parsing block below is skipped\n",
    "num_of_seqs = 0\n",
    "aa_in_proteome = 0\n",
    "\n",
    "# Download the gzip fasta file; the timeout (in seconds) keeps the cell from hanging on a stalled server\n",
    "response = requests.get(url, timeout=60)\n",
    "# Was the request fine?\n",
    "if response.status_code == 200:\n",
    "    # Wrap the downloaded bytes in BytesIO: it behaves like a \"file in memory\"\n",
    "    gzip_io = BytesIO(response.content)\n",
    "\n",
    "    # Decompress and parse on the fly\n",
    "    with gzip.open(gzip_io, \"rt\") as fasta_io: # rt: read in text mode\n",
    "        for record in SeqIO.parse(fasta_io, \"fasta\"):\n",
    "            num_of_seqs += 1                      # increase the seq counter\n",
    "            if num_of_seqs == 1: # just for displaying info on the first sequence\n",
    "                print(type(record))\n",
    "                print(repr(record)) # repr gives a printable representation of the object\n",
    "                print(f\"\\nDescripción: {record.description}\")\n",
    "                print(f\"ID: {record.id}\")             # full identifier taken from the fasta header\n",
    "                # The id has the form db|ACCESSION|ENTRY_NAME, so the accession is the 2nd field\n",
    "                acc_num = record.id.split('|')[1]\n",
    "                print(f\"Uniprot accession number: {acc_num}\")\n",
    "                print(f\"Sequence: {record.seq}\")\n",
    "\n",
    "            aa_in_proteome += len(record.seq)         # accumulate the size of the proteome\n",
    "\n",
    "    # Results: only reported when the download succeeded\n",
    "    print(\"Results:\")\n",
    "    print(\"The fasta file has a total of\", num_of_seqs, \"sequences\")\n",
    "    print(\"Total number of aa in the proteome: \", aa_in_proteome, \"aa\")\n",
    "    if num_of_seqs > 0: # an empty multifasta would otherwise cause a division by zero\n",
    "        print(\"Average length of a protein: \", aa_in_proteome/num_of_seqs, \"aa\")\n",
    "else:\n",
    "    print(f\"Couldn't get the file. HTTP status: {response.status_code}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OHyuC84ZVCs6"
   },
   "source": [
    "### Access to the 4th sequence in the multifasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 707,
     "status": "ok",
     "timestamp": 1731487430671,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "hdi9uoyEhlac",
    "outputId": "c646d61f-1ca9-4747-d493-69db2fec8f13"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequence number 4 is:\n",
      "<class 'Bio.SeqRecord.SeqRecord'>\n",
      "seq id:\t sp|P0DTC3|AP3A_SARS2\n",
      "seq:\t MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLAVFQSASKIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLEAPFLYLYALVYFLQSINFVRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPYNSVTSSIVITSGDGTTSPISEHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQLSTDTGVEHVTFFIYNKIVDEPEEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL\n",
      "seq len: 275 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Re-wrap the bytes downloaded by the earlier cell as an in-memory gzip file\n",
    "gzip_io = BytesIO(response.content)\n",
    "with gzip.open(gzip_io, \"rt\") as fasta_io: # \"rt\": read in text mode\n",
    "    # Materialise every record in a list so that one can be picked by position\n",
    "    my_parsed_seqio_l = list(SeqIO.parse(fasta_io, \"fasta\"))\n",
    "    selected_record = my_parsed_seqio_l[3] # positions start at 0, so index 3 is the 4th record\n",
    "    print(\"The sequence number 4 is:\")\n",
    "    print(type(selected_record))\n",
    "    print(\"seq id:\\t\", selected_record.id)\n",
    "    selected_seq = selected_record.seq\n",
    "    print(\"seq:\\t\", selected_seq)\n",
    "    print(\"seq len:\", len(selected_seq), end=\" \\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OHyuC84ZVCs6"
   },
   "source": [
    "### Access to a sequence given a UniProt accession number (e.g. P0DTC3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 707,
     "status": "ok",
     "timestamp": 1731487430671,
     "user": {
      "displayName": "Enrique M. Muro",
      "userId": "08274162213514611628"
     },
     "user_tz": -60
    },
    "id": "hdi9uoyEhlac",
    "outputId": "c646d61f-1ca9-4747-d493-69db2fec8f13"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accession: sp|P0DTC3|AP3A_SARS2\n",
      "Description: sp|P0DTC3|AP3A_SARS2 ORF3a protein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=3a PE=1 SV=1\n",
      "Sequence: MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLAVFQSASKIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLEAPFLYLYALVYFLQSINFVRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPYNSVTSSIVITSGDGTTSPISEHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQLSTDTGVEHVTFFIYNKIVDEPEEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL\n",
      "Sequence length: 275 aa\n"
     ]
    }
   ],
   "source": [
    "def get_sequence_by_accession(records, accession):\n",
    "    \"\"\"Return the first record whose id contains `accession`, or None.\n",
    "\n",
    "    records: iterable of records exposing an `id` attribute\n",
    "             (for example the iterator returned by SeqIO.parse).\n",
    "    accession: text searched for inside each record id (substring match).\n",
    "    \"\"\"\n",
    "    matching = (entry for entry in records if accession in entry.id)\n",
    "    return next(matching, None) # None when no id contains the accession\n",
    "\n",
    "def get_sequence_by_gene_name(records, key):\n",
    "    \"\"\"Return the first record whose description contains `key`, or None.\n",
    "\n",
    "    records: iterable of records exposing a `description` attribute\n",
    "             (for example the iterator returned by SeqIO.parse).\n",
    "    key: text searched for inside each description (substring match),\n",
    "         e.g. a gene name.\n",
    "    \"\"\"\n",
    "    matching = (entry for entry in records if key in entry.description)\n",
    "    return next(matching, None) # None when no description contains the key\n",
    "\n",
    "KEY = \"P0DTC3\" #\"42858\" #\"69905\" #\"P60484\"  # accession\n",
    "#KEY = \"TOP1\" #\"ADH4\" #\"PDGFB\" \"PTEN\" #\"HBA\"  # gene name\n",
    "\n",
    "# Re-wrap the bytes downloaded by the earlier cell as an in-memory gzip file\n",
    "gzip_io = BytesIO(response.content)\n",
    "with gzip.open(gzip_io, \"rt\") as fasta_io: # \"rt\": read in text mode\n",
    "    records = SeqIO.parse(fasta_io, \"fasta\")\n",
    "    # Search while the file is still open: `records` is consumed by the lookup function\n",
    "    record = get_sequence_by_accession(records, KEY)\n",
    "    #record = get_sequence_by_gene_name(records, KEY)\n",
    "\n",
    "    # The lookup functions return None when nothing matches\n",
    "    if record is not None:\n",
    "        print(f\"Accession: {record.id}\")\n",
    "        print(f\"Description: {record.description}\")\n",
    "        print(f\"Sequence: {record.seq}\")\n",
    "        print(f\"Sequence length: {len(record.seq)} aa\")\n",
    "    else:\n",
    "        print(\"Key (accession) not found in the multifasta.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOClWfxv9E2hjAN2EMkU+jd",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}