# Open and parse a remote multifasta file

## The multifasta file compressed with gzip

https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz

### The directory that contains the multifasta file
[https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024)

**Note**: if the multifasta have a space-line between sequences, Biopython takes care of it!

### See the multifasta @

[https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta](https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta)

## Install and import Biopython

In [1]:
# First install biopython if needed
if 0: # only in colab not in the JGU server
 try:
 #import google.colab
 # Running on Google Colab, so install Biopython first
 !pip install Bio # biopython
 except ImportError:
 pass

# import the modules
from Bio import SeqIO
import requests
import gzip
from io import BytesIO
import sys

# Amino acids (mass in Da)
### Note 
1 Da = 1.660539 $10^{−27}$ Kg

## The mass of the different amino acids
this data structure is a dictionary

In [2]:
amino_acid_masses = {
 "A": 89.1, # Alanine
 "R": 174.2, # Arginine
 "N": 132.1, # Asparagine
 "D": 133.1, # Aspartic acid
 "C": 121.2, # Cysteine
 "Q": 146.2, # Glutamine
 "E": 147.1, # Glutamic acid
 "G": 75.1, # Glycine
 "H": 155.2, # Histidine
 "I": 131.2, # Isoleucine
 "L": 131.2, # Leucine
 "K": 146.2, # Lysine
 "M": 149.2, # Methionine
 "F": 165.2, # Phenylalanine
 "P": 115.1, # Proline
 "S": 105.1, # Serine
 "T": 119.1, # Threonine
 "W": 204.2, # Tryptophan
 "Y": 181.2, # Tyrosine
 "V": 117.1 # Valine
}

## Calculate the weight of a molecule of water

In [3]:
# Atomic weights in g/mol
hydrogen_weight = 1.008 # g/mol
oxygen_weight = 16.00 # g/mol

# Calculate molecular weight of water (H2O)
water_weight = (2 * hydrogen_weight) + oxygen_weight

# Output the result in Da (1 g/mol = 1 Da)
print(f"The molecular weight of one molecule of water is {water_weight} Da.")


The molecular weight of one molecule of water is 18.016 Da.


## Retrieve the multifasta of the proteome (from internet in this case)
and process it.

## Parse the file.
- Iterate all over the sequences
- calculate in each sequence



In [4]:
# URL of the gzip fasta file
# sars2
url = "https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz"
print(url)

# load the gzip fasta file
response = requests.get(url)
# Was the request fine?
if response.status_code == 200:
 # load it in BytesIO. It will be kindy of a "file on memory"
 gzip_io = BytesIO(response.content)

 # Descompress, read each sequence and calculate 
 num_of_seqs = 0
 aa_in_proteome = 0
 proteome_mass_with_water_molecule = 0 # the water molecule is not in the polypeptide
 proteome_mass = 0 
 with gzip.open(gzip_io, "rt") as fasta_io: # rt: read in text mode it is need
 for record in SeqIO.parse(fasta_io, "fasta"):
 num_of_seqs += 1 # increase the seq counter
 for aa in record.seq:
 if num_of_seqs >1: # only remove the molecule of water when bounding a new aa
 proteome_mass = proteome_mass + amino_acid_masses.get(aa, 0.0) - water_weight # improve in case of unknown aa!
 else:
 proteome_mass = proteome_mass + amino_acid_masses.get(aa, 0.0) # 1st aa
 proteome_mass_with_water_molecule = proteome_mass_with_water_molecule + amino_acid_masses.get(aa, 0.0) 
 aa_in_proteome += len(record.seq)
else:
 print(f"Couldn't get the file. HTTP status: {response.status_code}")

# results
print("Results:")
print("The proteome has a total of", num_of_seqs, "proteins")
print("Total number of aa in the proteome: ", aa_in_proteome, "aa")
print(f"Average length of a protein: {aa_in_proteome/num_of_seqs} aa")
print(f"Total mass of the proteome: {proteome_mass} Da")
print(f"Average aa mass in this proteome (including the water molecule that is not in each dipeptide): {proteome_mass_with_water_molecule/aa_in_proteome} Da")
print(f"Average aa mass in this proteome (standard): {proteome_mass/aa_in_proteome} Da")


https://cbdm-01.zdv.uni-mainz.de/~muro/teaching/data-teaching/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Viruses/UP000464024/UP000464024_2697049.fasta.gz
Results:
The proteome has a total of 17 proteins
Total number of aa in the proteome: 14439 aa
Average length of a protein: 849.3529411764706 aa
Total mass of the proteome: 1613005.8839999083 Da
Average aa mass in this proteome (including the water molecule that is not in each dipeptide): 129.68033104785883 Da
Average aa mass in this proteome (standard): 111.71174485767078 Da
