In [2]:
amino_acid_masses = {
        '': 0,
        'G': 57,
        'A': 71,
        'S': 87,
        'P': 97,
        'V': 99,
        'T': 101,
        'C': 103,
        'I': 113,
        'L': 113,
        'N': 114,
        'D': 115,
        'K': 128,
        'Q': 128,
        'E': 129,
        'M': 131,
        'H': 137,
        'F': 147,
        'R': 156,
        'Y': 163,
        'W': 186,
    }
In [179]:
def linear_spectrum(peptide):
    n = len(peptide)
    prefix_mass = [0 for i in range(n + 1)]
    
    for i in range(n):
        aa = peptide[i]
        prefix_mass[i + 1] = prefix_mass[i] + amino_acid_masses[aa]

    spectrum = [0]
    
    for i in range(n):
        for j in range(i + 1, n + 1):
            spectrum.append(prefix_mass[j] - prefix_mass[i])
            
    spectrum.sort()
    return spectrum
    
linear_spectrum('MATFMATFMATF')
Out[179]:
[0,
 71,
 71,
 71,
 101,
 101,
 101,
 131,
 131,
 131,
 147,
 147,
 147,
 172,
 172,
 172,
 202,
 202,
 202,
 248,
 248,
 248,
 278,
 278,
 303,
 303,
 303,
 319,
 319,
 319,
 349,
 349,
 379,
 379,
 450,
 450,
 450,
 450,
 450,
 450,
 450,
 450,
 450,
 521,
 521,
 551,
 551,
 581,
 581,
 597,
 597,
 622,
 622,
 652,
 652,
 698,
 698,
 728,
 753,
 753,
 769,
 769,
 799,
 829,
 900,
 900,
 900,
 900,
 900,
 971,
 1001,
 1031,
 1047,
 1072,
 1102,
 1148,
 1203,
 1219,
 1350]
In [161]:
def cyclic_spectrum(peptide):
    n = len(peptide)
    prefix_mass = [0 for i in range(n + 1)]
    
    for i in range(n):
        aa = peptide[i]
        prefix_mass[i + 1] = prefix_mass[i] + amino_acid_masses[aa]

    spectrum = [0]
    peptide_mass = prefix_mass[-1]
    
    for i in range(n):
        for j in range(i + 1, n + 1):
            segment_sum = prefix_mass[j] - prefix_mass[i]
            spectrum.append(segment_sum)
            
            if i > 0 and j < n:
                spectrum.append(peptide_mass - segment_sum)
            
    spectrum.sort()
    return spectrum
    
cyclic_spectrum('NQEL')
Out[161]:
[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]
In [22]:
def extend(peptides):
    extension = []
    
    for peptide in peptides:
        for aa in amino_acid_masses:
            if aa != '':
                extension.append(peptide + aa)
            
    return extension
In [24]:
peptides = ['SP']
extend(peptides)
Out[24]:
['SPG',
 'SPA',
 'SPS',
 'SPP',
 'SPV',
 'SPT',
 'SPC',
 'SPI',
 'SPL',
 'SPN',
 'SPD',
 'SPK',
 'SPQ',
 'SPE',
 'SPM',
 'SPH',
 'SPF',
 'SPR',
 'SPY',
 'SPW']
In [49]:
def consistent(peptide, spectrum):
    peptide_linear_spectrum = linear_spectrum(peptide)
    
    n = len(peptide_linear_spectrum)
    m = len(spectrum)
    
    i = 0
    j = 0
    
    while i < n:
        while spectrum[j] != peptide_linear_spectrum[i]:
            j += 1
            if j == m:
                return False
        i += 1
        j += 1
        
    return True
In [28]:
def mass(peptide):
    total_mass = 0
    for aa in peptide:
        total_mass += amino_acid_masses[aa]
        
    return total_mass
In [54]:
def cyclopeptide_sequencing(spectrum):
    results = []
    peptides = ['']
    parent_mass = spectrum[-1]
    
    while len(peptides) > 0:
        extension = extend(peptides)
        survived = []
        
        for peptide in extension:
            if mass(peptide) == parent_mass:
                if cyclic_spectrum(peptide) == spectrum:
                    results.append(peptide)
                
            elif consistent(peptide, spectrum):
                survived.append(peptide)
        
        peptides = survived
            
    return results
In [164]:
# spectrum = [0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]

spectrum = [0, 113, 114, 128, 129, 129, 139, 242, 370, 371, 484]
cyclopeptide_sequencing(spectrum)
Out[164]:
[]
In [68]:
def score(peptide_spectrum, target_spectrum):
    n = len(peptide_spectrum)
    m = len(target_spectrum)
    
    i = 0
    j = 0
    last_found = 0
    
    total_score = 0
    
    while i < n:
        j = last_found
        
        while j < m and target_spectrum[j] <= peptide_spectrum[i]:
            if peptide_spectrum[i] == target_spectrum[j]:
                last_found = j + 1
                total_score += 1
                break
            j += 1
            
        i += 1
            
    return total_score
In [71]:
def linear_score(peptide, spectrum):
    peptide_linear_spectrum = linear_spectrum(peptide)
    return score(peptide_linear_spectrum, spectrum)
In [72]:
def cyclic_score(peptide, spectrum):
    peptide_cyclic_spectrum = cyclic_spectrum(peptide)
    return score(peptide_cyclic_spectrum, spectrum)
In [69]:
s1 = [0,1,2,2,4]
s2 = [0, 1, 2, 3, 4, 5]

score(s1, s2)
Out[69]:
4
In [84]:
peptide = 'MAVT'

print(linear_score(peptide, spectrum))
5
In [128]:
def trim(leaderboard, spectrum, N): 
    if len(leaderboard) <= N:
        return leaderboard
    
    sorted_leaderboard = []
    
    for peptide in leaderboard:
        peptide_linear_score = linear_score(peptide, spectrum)
        sorted_leaderboard.append((peptide_linear_score, peptide))
        
    sorted_leaderboard.sort(reverse=True)
    
    for i in range(N,len(leaderboard)):
        if sorted_leaderboard[i][0] < sorted_leaderboard[N - 1][0]:
            trimmed = sorted_leaderboard[:i]
            return [x[1] for x in trimmed]
    trimmed = sorted_leaderboard
    return [x[1] for x in trimmed]
In [129]:
def leaderboard_cyclopeptide_sequencing(spectrum, N):
    leaderboard = ['']
    leader_peptide = ''
    leader_peptide_score = 0
    
    parent_mass = spectrum[-1]
    
    while len(leaderboard) > 0:
        extension = extend(leaderboard)
        survived = []
        
        for peptide in extension:
            peptide_mass = mass(peptide)
            if peptide_mass == parent_mass:
                peptide_score = cyclic_score(peptide, spectrum)
                if peptide_score > leader_peptide_score:
                    leader_peptide_score = peptide_score
                    leader_peptide = peptide
                    
                survived.append(peptide)
                    
            elif peptide_mass <= parent_mass:
                survived.append(peptide)
                
        leaderboard = trim(survived, spectrum, N)
    
    return leader_peptide
                    
            
In [190]:
spectrum = [0,
 71,
 71,
 71,
 101,
 101,
 101,
 131,
 131,
 131,
 147,
 147,
 147,
 172,
 172,
 172,
 202,
 202,
 202,
 248,
 248,
 248,
 278,
 278,
 303,
 303,
 303,
 319,
 319,
 319,
 349,
 349,
 379,
 379,
 450,
 450,
 450,
 450,
 450,
 450,
 450,
 450,
 450,
 521,
 521,
 551,
 551,
 581,
 581,
 597,
 597,
 622,
 622,
 652,
 652,
 698,
 698,
 728,
 753,
 753,
 769,
 769,
 799,
 829,
 900,
 900,
 900,
 900,
 900,
 971,
 1001,
 1031,
 1047,
 1072,
 1102,
 1148,
 1203,
 1219,
 1350]
N = 10
leaderboard_cyclopeptide_sequencing(spectrum, N)
Out[190]:
'TAMFTAMFTAMF'