Pomoćne funkcije za prevođenje DNK sekvenci u brojeve i obrnuto:
def symbol_to_number(symbol):
mapping = {
'A': 0,
'T': 1,
'C': 2,
'G': 3,
'a': 0,
't': 1,
'c': 2,
'g': 3
}
return mapping[symbol]
def pattern_to_number(pattern):
if len(pattern) == 1:
return symbol_to_number(pattern)
else:
prefix = pattern[:-1]
suffix = pattern[-1]
suffix_number = symbol_to_number(suffix)
return pattern_to_number(prefix) * 4 + suffix_number
pattern_to_number('ATCG')
def number_to_symbol(number):
mapping = {
0: 'A',
1: 'T',
2: 'C',
3: 'G'
}
return mapping[number]
def number_to_pattern(number, k):
if k == 1:
return number_to_symbol(number)
else:
prefix = number // 4
suffix = number % 4
suffix_symbol = number_to_symbol(suffix)
return number_to_pattern(prefix, k - 1) + suffix_symbol
number_to_pattern(27, 10)
Funkcija hamming_distance izracunava broj pozicija na kojima se dve niske razlikuju.
def hamming_distance(pattern_1, pattern_2):
n = len(pattern_1)
distance = 0
for i in range(n):
if pattern_1[i] != pattern_2[i]:
distance += 1
return distance
Funkcija distance pronalazi minimalnu hamingovu distancu zadate niske pattern u odnosu na sve podniske niske dna_sequences dužine k.
def distance(dna_sequences, pattern, k):
total_distance = 0
for sequence in dna_sequences:
minimal_distance = float('inf')
for i in range(len(sequence) - k + 1):
current_pattern = sequence[i:i + k]
current_distance = hamming_distance(current_pattern, pattern)
if current_distance < minimal_distance:
minimal_distance = current_distance
total_distance += minimal_distance
return total_distance
Funkcija median_string pronalazi mediana nisku motiva iz zadatih DNK sekvenci.
def median_string(dna_sequences, k):
min_distance = float('inf')
min_pattern = ''
for i in range(4 ** k):
pattern = number_to_pattern(i, k)
current_distance = distance(dna_sequences, pattern, k)
if min_distance > current_distance:
min_distance = current_distance
min_pattern = pattern
return min_pattern
dna_sequences = ['ATCGATCGTTTG',
'AGCGCTCGTTCG',
'ACCGTTTGATAG',
'AACGATTTTCG',
'AACGATCTTTCG']
k = 10
median_string(dna_sequences, k)
Funkcija score izračunava skor trenutnog skupa motiva
def score(motifs, k, t):
# A T C G
counts = [[0, 0, 0, 0] for i in range(k)]
total_score = 0
for sequence in motifs:
for i in range(k):
current_symbol = sequence[i]
index = symbol_to_number(current_symbol)
counts[i][index] += 1
for sequence in motifs:
for i in range(k):
most_frequent = counts[i].index(max(counts[i]))
if most_frequent != symbol_to_number(sequence[i]):
total_score += 1
return total_score
motifs = ['TCGGGGgTTTtt',
'cCGGtGAcTTaC',
'aCGGGGATTTtC',
'TtGGGGAcTTtt',
'aaGGGGAcTTCC',
'TtGGGGAcTTCC',
'TCGGGGATTcat',
'TCGGGGATTcCt',
'TaGGGGAacTaC',
'TCGGGtATaaCC']
k = 12
t = 10
score(motifs, k, t)
Funkcija create_profile formira profil matricu od zadatog skupa motiva.
def create_profile(motifs, k, t):
pseudocount = 1
profile = [
[pseudocount/(t + 4 * pseudocount) for i in range(4)]
for i in range(k)]
for sequence in motifs:
for i in range(k):
current_symbol = sequence[i]
index = symbol_to_number(current_symbol)
profile[i][index] += 1 / (t + 4 * pseudocount)
return profile
create_profile(motifs, k, t)
Funkcija probability izračunava verovatnoću niske current_pattern u odnosu na profilnu matricu.
def probability(profile, current_pattern):
prob = 1
for i in range(len(current_pattern)):
current_symbol = current_pattern[i]
index = symbol_to_number(current_symbol)
prob *= profile[i][index]
return prob
Funkcija most_probable_kmer pronalazi najverovatniji k-gram iz niske dna_string u odnosu na zadatu profil matricu
def most_probable_kmer(profile, dna_string, k):
max_prob = -1
max_prob_pattern = ''
for i in range(len(dna_string) - k):
current_pattern = dna_string[i:i + k]
current_prob = probability(profile, current_pattern)
if (current_prob > max_prob):
max_prob = current_prob
max_prob_pattern = current_pattern
return max_prob_pattern
Funkcija greedy_motif_search pronalazi skup motiva dužine k iz niza od t DNK sekvenci dna_sequences pohlepnim pristupom, formirajuci iterativno skup motiva dodavanjem najverovatnijeg k-grama iz i-te sekvence u odnosu na profil matricu sastavljenu od motiva iz i-1 sekvenci.
import copy
def greedy_motif_search(dna_sequences, k, t): # t - broj sekvenci
best_motifs = [sequence[:k] for sequence in dna_sequences]
best_score = score(best_motifs, k, t)
first_string = dna_sequences[0]
for i in range(len(first_string) - k + 1):
motif1 = first_string[i:i + k]
motifs = [motif1]
for j in range(1, t):
profile = create_profile(motifs, k, t)
current_string = dna_sequences[j]
new_motif = most_probable_kmer(profile, current_string, k)
motifs.append(new_motif)
current_score = score(motifs, k, t)
if current_score < best_score:
best_score = current_score
# best_motifs = motifs
best_motifs = copy.deepcopy(motifs)
return best_motifs
dna_sequences = ['TTCGATCGTTTG',
'AGCGCTCGTTCG',
'CCCGTTTGATAG',
'AACGATTTTCG',
'AACGATCTTTCG']
k = 4
t = len(dna_sequences)
greedy_motif_search(dna_sequences, k, t)
Funkcija motifs_from_profile pronalazi najverovatnije k-grame iz zadatih sekvenci u odnosu na profil matricu i od njih formira skup motiva.
def motifs_from_profile(profile, dna_sequences, k, t):
motifs = []
for sequence in dna_sequences:
motifs.append(most_probable_kmer(profile, sequence, k))
return motifs
Funkcija randomized_motif_search pronalazi skup motiva dužine k iz niza od t DNK sekvenci dna_sequences polaskom od skupa motiva odabranih pseudoslučajnim pristupom i zatim iterativnim formiranjem profila od skupa motiva praćenim formiranjem novog skupa motiva u odnosu na dobijeni profil.
import random
def randomized_motif_search(dna_sequences, k, t):
best_motifs = []
for sequence in dna_sequences:
i = random.randrange(0, len(sequence) - k + 1)
best_motifs.append(sequence[i:i + k])
best_score = score(best_motifs, k, t)
motifs = copy.deepcopy(best_motifs)
while True:
profile = create_profile(motifs, k, t)
motifs = motifs_from_profile(profile, dna_sequences, k, t)
current_score = score(motifs, k, t)
if current_score < best_score:
best_score = current_score
best_motifs = copy.deepcopy(motifs)
else:
break
return best_motifs
dna_sequences = ['ttACCTtaac',
'gATGTctgtc',
'ccgGCGTtag',
'cactaACGAg',
'cgtcagAGGT']
k = 4
t = len(dna_sequences)
randomized_motif_search(dna_sequences, k, t)
def most_probable_gibbs_kmer(profile, dna_sequence, k):
kmers = []
probabilities = []
max_probability = float('-inf')
min_probability = float('inf')
for i in range(len(dna_sequence) - k + 1):
kmer = dna_sequence[i:i + k]
kmers.append(kmer)
kmer_probability = probability(profile, kmer)
probabilities.append(kmer_probability)
probability_sum = sum(probabilities)
random_point = random.random() * probability_sum
current_sum = 0
for i in range(len(probabilities)):
p = probabilities[i]
current_sum += p
if current_sum >= random_point:
return kmers[i]
Funkcija gibbs_sampler pronalazi skup motiva dužine k iz niza od t DNK sekvenci dna_sequences. Algoritam polazi od skupa motiva odabranih pseudoslučajnim pristupom a zatim iterativno uklanja i-ti motiva iz skupa motiva (za pseudoslučajno odabrani indeks i), formira profil od preostalih motifa i u skup motiva dodaje k-gram iz i-te sekvence odabran pristrasnim pseudoslučajnim izborom. Ukoliko je u nekoj iteraciji dobijen skup motiva sa boljim skorom od do sada pronadjenim najboljim, tekući skup motiva se postavlja za trenutno najbolji.
def gibbs_sampler(dna_sequences, k, t, N):
best_motifs = []
for sequence in dna_sequences:
i = random.randrange(0, len(sequence) - k + 1)
best_motifs.append(sequence[i:i + k])
best_score = score(best_motifs, k, t)
motifs = copy.deepcopy(best_motifs)
for i in range(N):
i = random.randrange(t)
del motifs[i]
profile = create_profile(motifs, k, t - 1)
new_motif_i = most_probable_gibbs_kmer(profile, dna_sequences[i], k)
motifs.insert(i, new_motif_i)
current_score = score(motifs, k, t)
if current_score < best_score:
best_score = current_score
best_motifs = copy.deepcopy(motifs)
return best_motifs
dna_sequences = ['ttACCTtaac',
'gATGTctgtc',
'ccgGCGTtag',
'cactaACGAg',
'cgtcagAGGT']
k = 4
t = len(dna_sequences)
N = 1000
gibbs_sampler(dna_sequences, k, t, N)