Pronalaženje motiva u skupu sekvenci

Pomoćne funkcije za prevođenje DNK sekvenci u brojeve i obrnuto:

In [7]:
def symbol_to_number(symbol):
    mapping = {
        'A': 0,
        'T': 1,
        'C': 2,
        'G': 3,
        'a': 0,
        't': 1,
        'c': 2,
        'g': 3
    }
    return mapping[symbol]

def pattern_to_number(pattern):
    if len(pattern) == 1:
        return symbol_to_number(pattern)
    else:
        prefix = pattern[:-1]
        suffix = pattern[-1]
        suffix_number = symbol_to_number(suffix)
        return pattern_to_number(prefix) * 4 + suffix_number
In [8]:
pattern_to_number('ATCG')
Out[8]:
27
In [9]:
def number_to_symbol(number):
    mapping = {
        0: 'A',
        1: 'T',
        2: 'C',
        3: 'G'
    }

    return mapping[number]
In [10]:
def number_to_pattern(number, k):
    if k == 1:
        return number_to_symbol(number)
    else:
        prefix = number // 4
        suffix = number % 4
        suffix_symbol = number_to_symbol(suffix)
        return number_to_pattern(prefix, k - 1) + suffix_symbol
In [11]:
number_to_pattern(27, 10)
Out[11]:
'AAAAAAATCG'

Funkcija hamming_distance izracunava broj pozicija na kojima se dve niske razlikuju.

In [12]:
def hamming_distance(pattern_1, pattern_2):
    n = len(pattern_1)
    distance = 0
    
    for i in range(n):
        if pattern_1[i] != pattern_2[i]:
            distance += 1
            
    return distance

Funkcija distance pronalazi minimalnu hamingovu distancu zadate niske pattern u odnosu na sve podniske niske dna_sequences dužine k.

In [13]:
def distance(dna_sequences, pattern, k):
    total_distance = 0
    for sequence in dna_sequences:
        minimal_distance = float('inf')
        
        for i in range(len(sequence) - k + 1):
            current_pattern = sequence[i:i + k]
            current_distance = hamming_distance(current_pattern, pattern)

            if current_distance < minimal_distance:
                minimal_distance = current_distance

        total_distance += minimal_distance
        
    return total_distance    

Funkcija median_string pronalazi mediana nisku motiva iz zadatih DNK sekvenci.

In [14]:
def median_string(dna_sequences, k):
    min_distance = float('inf')
    min_pattern = ''
    
    for i in range(4 ** k):
        pattern = number_to_pattern(i, k)
        current_distance = distance(dna_sequences, pattern, k)
        if min_distance > current_distance:
            min_distance = current_distance
            min_pattern = pattern
    return min_pattern
In [15]:
dna_sequences = ['ATCGATCGTTTG', 
                 'AGCGCTCGTTCG', 
                 'ACCGTTTGATAG', 
                 'AACGATTTTCG', 
                 'AACGATCTTTCG']
k = 10

median_string(dna_sequences, k)
Out[15]:
'AACGATCGTT'

Funkcija score izračunava skor trenutnog skupa motiva

In [16]:
def score(motifs, k, t):
    #          A  T  C  G
    counts = [[0, 0, 0, 0] for i in range(k)]
    
    total_score = 0
    
    for sequence in motifs:
        for i in range(k):
            current_symbol = sequence[i]
            index = symbol_to_number(current_symbol)
            counts[i][index] += 1
    
    for sequence in motifs:
        for i in range(k):
            most_frequent = counts[i].index(max(counts[i]))
            if most_frequent != symbol_to_number(sequence[i]):
                total_score += 1

    return total_score
In [17]:
motifs = ['TCGGGGgTTTtt',
'cCGGtGAcTTaC',
'aCGGGGATTTtC',
'TtGGGGAcTTtt',
'aaGGGGAcTTCC',
'TtGGGGAcTTCC',
'TCGGGGATTcat',
'TCGGGGATTcCt',
'TaGGGGAacTaC',
'TCGGGtATaaCC']
k = 12
t = 10

score(motifs, k, t)
Out[17]:
30

Funkcija create_profile formira profil matricu od zadatog skupa motiva.

In [53]:
def create_profile(motifs, k, t):
    pseudocount = 1
    profile = [
        [pseudocount/(t + 4 * pseudocount) for i in range(4)]
        for i in range(k)]
    
    for sequence in motifs:
        for i in range(k):
            current_symbol = sequence[i]
            index = symbol_to_number(current_symbol)
            profile[i][index] += 1 / (t + 4 * pseudocount)
            
    return profile
In [54]:
create_profile(motifs, k, t)
Out[54]:
[[0.3333333333333333,
  0.8888888888888891,
  0.2222222222222222,
  0.1111111111111111],
 [0.3333333333333333,
  0.3333333333333333,
  0.7777777777777779,
  0.1111111111111111],
 [0.1111111111111111,
  0.1111111111111111,
  0.1111111111111111,
  1.2222222222222225],
 [0.1111111111111111,
  0.1111111111111111,
  0.1111111111111111,
  1.2222222222222225]]

Funkcija probability izračunava verovatnoću niske current_pattern u odnosu na profilnu matricu.

In [55]:
def probability(profile, current_pattern):
    prob = 1
    for i in range(len(current_pattern)):
        current_symbol = current_pattern[i]
        index = symbol_to_number(current_symbol)
        prob *= profile[i][index]
        
    return prob

Funkcija most_probable_kmer pronalazi najverovatniji k-gram iz niske dna_string u odnosu na zadatu profil matricu

In [21]:
def most_probable_kmer(profile, dna_string, k):
    max_prob = -1
    max_prob_pattern = ''
    
    for i in range(len(dna_string) - k):
        current_pattern = dna_string[i:i + k]
        current_prob = probability(profile, current_pattern)
        if (current_prob > max_prob):
            max_prob = current_prob
            max_prob_pattern = current_pattern
    
    return max_prob_pattern
        

Funkcija greedy_motif_search pronalazi skup motiva dužine k iz niza od t DNK sekvenci dna_sequences pohlepnim pristupom, formirajuci iterativno skup motiva dodavanjem najverovatnijeg k-grama iz i-te sekvence u odnosu na profil matricu sastavljenu od motiva iz i-1 sekvenci.

In [22]:
import copy

def greedy_motif_search(dna_sequences, k, t): # t - broj sekvenci
    best_motifs = [sequence[:k] for sequence in dna_sequences]
    best_score = score(best_motifs, k, t)
    
    first_string = dna_sequences[0]
    
    for i in range(len(first_string) - k + 1):
        motif1 = first_string[i:i + k]
        motifs = [motif1]
        
        for j in range(1, t):
            profile = create_profile(motifs, k, t)
            current_string = dna_sequences[j]
            new_motif = most_probable_kmer(profile, current_string, k)
            motifs.append(new_motif)
    
        current_score = score(motifs, k, t)
        if current_score < best_score:
            best_score = current_score
            # best_motifs = motifs
            best_motifs = copy.deepcopy(motifs)
            
    return best_motifs
    
In [23]:
dna_sequences = ['TTCGATCGTTTG', 
                 'AGCGCTCGTTCG', 
                 'CCCGTTTGATAG', 
                 'AACGATTTTCG', 
                 'AACGATCTTTCG']
k = 4
t = len(dna_sequences)

greedy_motif_search(dna_sequences, k, t)
Out[23]:
['CGAT', 'CGCT', 'CGTT', 'CGAT', 'CGAT']

Funkcija motifs_from_profile pronalazi najverovatnije k-grame iz zadatih sekvenci u odnosu na profil matricu i od njih formira skup motiva.

In [24]:
def motifs_from_profile(profile, dna_sequences, k, t):
    motifs = []
    for sequence in dna_sequences:
        motifs.append(most_probable_kmer(profile, sequence, k))
    return motifs

Funkcija randomized_motif_search pronalazi skup motiva dužine k iz niza od t DNK sekvenci dna_sequences polaskom od skupa motiva odabranih pseudoslučajnim pristupom i zatim iterativnim formiranjem profila od skupa motiva praćenim formiranjem novog skupa motiva u odnosu na dobijeni profil.

In [25]:
import random

def randomized_motif_search(dna_sequences, k, t):
    best_motifs = []
    for sequence in dna_sequences:
        i = random.randrange(0, len(sequence) - k + 1)
        best_motifs.append(sequence[i:i + k])
        
    best_score = score(best_motifs, k, t)
    
    motifs = copy.deepcopy(best_motifs)
    while True:
        profile = create_profile(motifs, k, t)
        motifs = motifs_from_profile(profile, dna_sequences, k, t)
        
        current_score = score(motifs, k, t)
        if current_score < best_score:
            best_score = current_score
            best_motifs = copy.deepcopy(motifs)
        else:
            break
            
    return best_motifs
        
        
In [205]:
dna_sequences = ['ttACCTtaac',
'gATGTctgtc',
'ccgGCGTtag',
'cactaACGAg',
'cgtcagAGGT']
k = 4
t = len(dna_sequences)

randomized_motif_search(dna_sequences, k, t)
Out[205]:
['ACCT', 'gATG', 'gGCG', 'aACG', 'gAGG']
In [66]:
def most_probable_gibbs_kmer(profile, dna_sequence, k):
    kmers = []
    probabilities = []
    
    max_probability = float('-inf')
    min_probability = float('inf')
    
    for i in range(len(dna_sequence) - k + 1):
        kmer = dna_sequence[i:i + k]
        kmers.append(kmer)
        
        kmer_probability = probability(profile, kmer)
        probabilities.append(kmer_probability)
    
    
    probability_sum = sum(probabilities)
    random_point = random.random() * probability_sum
    
    current_sum = 0
    for i in range(len(probabilities)):
        p = probabilities[i]
        
        current_sum += p
        
        if current_sum >= random_point:
            return kmers[i]

Funkcija gibbs_sampler pronalazi skup motiva dužine k iz niza od t DNK sekvenci dna_sequences. Algoritam polazi od skupa motiva odabranih pseudoslučajnim pristupom a zatim iterativno uklanja i-ti motiva iz skupa motiva (za pseudoslučajno odabrani indeks i), formira profil od preostalih motifa i u skup motiva dodaje k-gram iz i-te sekvence odabran pristrasnim pseudoslučajnim izborom. Ukoliko je u nekoj iteraciji dobijen skup motiva sa boljim skorom od do sada pronadjenim najboljim, tekući skup motiva se postavlja za trenutno najbolji.

In [67]:
def gibbs_sampler(dna_sequences, k, t, N):
    best_motifs = []
    for sequence in dna_sequences:
        i = random.randrange(0, len(sequence) - k + 1)
        best_motifs.append(sequence[i:i + k])
        
    best_score = score(best_motifs, k, t)
        
    motifs = copy.deepcopy(best_motifs)
    for i in range(N):
        i = random.randrange(t)
        del motifs[i]
        
        profile = create_profile(motifs, k, t - 1)
        new_motif_i = most_probable_gibbs_kmer(profile, dna_sequences[i], k)
        motifs.insert(i, new_motif_i)
        
        current_score = score(motifs, k, t)
        if current_score < best_score:
            best_score = current_score
            best_motifs = copy.deepcopy(motifs)
            
    return best_motifs       
In [165]:
dna_sequences = ['ttACCTtaac',
'gATGTctgtc',
'ccgGCGTtag',
'cactaACGAg',
'cgtcagAGGT']
k = 4
t = len(dna_sequences)
N = 1000

gibbs_sampler(dna_sequences, k, t, N)
Out[165]:
['ACCT', 'ATGT', 'GCGT', 'ACGA', 'AGGT']