In [2]:
class TrieNode:
    def __init__(self, label):
        self.label = label
        self.neighbors = {}
        self.is_leaf = True
        
    def add_neighbor(self, character):
        self.is_leaf = False
        self.neighbors[character] = TrieNode(self.label + character)
        
    def has_neighbor(self, character):
        if character in self.neighbors:
            return True
        return False
    
    def get_neighbor(self, character):
        if self.has_neighbor(character):
            return self.neighbors[character]
        return None
    
    def get_neighbors(self):
        return self.neighbors
In [3]:
def trie_construction(patterns):
    root = TrieNode('')
    
    for pattern in patterns:
        current_node = root
        
        for current_symbol in pattern:
            if not current_node.has_neighbor(current_symbol):
                current_node.add_neighbor(current_symbol)
            current_node = current_node.get_neighbor(current_symbol)
    
    return root
In [4]:
patterns = ['ab','banana', 'ananas', 'cucumber']
trie_construction(patterns)
Out[4]:
<__main__.TrieNode at 0x10c850748>
In [5]:
def prefix_trie_matching(text, trie):
    current_node = trie
    for c in text:
        if current_node.is_leaf:
            return current_node.label
        if current_node.has_neighbor(c):
            current_node = current_node.get_neighbor(c)
        else:
            return False

    if current_node.is_leaf:
        return current_node.label
    else:
        return False
In [6]:
text = 'bananas'
patterns = ['ab', 'bananas', 'ananas', 'cucumber']
trie = trie_construction(patterns)

prefix_trie_matching(text, trie)
Out[6]:
'bananas'
In [7]:
def trie_matching(text, trie):
    results = []
    n = len(text)
    for i in range(n):
        match = prefix_trie_matching(text[i:], trie)
        if match:
            results.append((match, i))
    return results
In [8]:
text = 'babananas'
patterns = ['ab', 'bananas', 'ananas', 'cucumber']
trie = trie_construction(patterns)

trie_matching(text, trie)
Out[8]:
[('ab', 1), ('bananas', 2), ('ananas', 3)]
In [9]:
def generate_suffixes(text):
    suffixes = []
    n = len(text)
    for i in range(n):
        suffixes.append(text[i:])
        
    return suffixes
In [10]:
text = 'panamabananas$'

generate_suffixes(text)
Out[10]:
['panamabananas$',
 'anamabananas$',
 'namabananas$',
 'amabananas$',
 'mabananas$',
 'abananas$',
 'bananas$',
 'ananas$',
 'nanas$',
 'anas$',
 'nas$',
 'as$',
 's$',
 '$']
In [11]:
def generate_suffix_array(text):
    raw_suffixes = generate_suffixes(text)
    
    n = len(text)
    suffixes = list(zip(raw_suffixes, [i for i in range(n)]))
    
    suffixes.sort()
    return suffixes
In [12]:
text = 'panamabananas$'
generate_suffix_array(text)
Out[12]:
[('$', 13),
 ('abananas$', 5),
 ('amabananas$', 3),
 ('anamabananas$', 1),
 ('ananas$', 7),
 ('anas$', 9),
 ('as$', 11),
 ('bananas$', 6),
 ('mabananas$', 4),
 ('namabananas$', 2),
 ('nanas$', 8),
 ('nas$', 10),
 ('panamabananas$', 0),
 ('s$', 12)]
In [13]:
text = 'banana$'
suffixes = generate_suffixes(text)
trie_construction([x[0] for x in suffixes])
Out[13]:
<__main__.TrieNode at 0x10c8d4b38>
In [14]:
def is_prefix(pattern, text):
    n = len(pattern)
    m = len(text)
    
    if n > m:
        return False
    
    return text[:n] == pattern
In [15]:
is_prefix('ac', 'abbc')
Out[15]:
False
In [16]:
def pattern_matching_with_suffix_array(pattern, suffix_array):
    n = len(suffix_array)
    l = 0
    d = n
    
    while l <= d:
        mid = (l + d) // 2
        mid_suffix = suffix_array[mid][0]

        if is_prefix(pattern, mid_suffix):
            
            i = mid - 1
            j = mid + 1
            while i >= 0:
                if not is_prefix(pattern, suffix_array[i][0]):
                    i += 1
                    break
                i -= 1
                
            while j < n:
                if not is_prefix(pattern, suffix_array[j][0]):
                    break
                j += 1
                
            return [x[1] for x in suffix_array[i:j]]
        
        if mid_suffix < pattern:
            l = mid
            
        else:
            d = mid
    
    return []
In [17]:
pattern = 'na'
text = 'panamabananas$'

suffix_array = generate_suffix_array(text)

pattern_matching_with_suffix_array(pattern, suffix_array)
Out[17]:
[2, 8, 10]
In [18]:
def generate_cyclic_permutations(text):
    permutations = []
    
    n = len(text)
    
    for i in range(n):
        prefix = text[:(n-i)]
        suffix = text[(n-i):]
        permutations.append(suffix + prefix)
        
    return permutations
In [19]:
def bwt_construction(text):
    permutations = generate_cyclic_permutations(text)
    permutations.sort()
    bwt_text = ''.join([x[-1] for x in permutations])
    return bwt_text
In [21]:
text = 'banana$'
bwt_text = bwt_construction(text)
bwt_list = list(bwt_text)
bwt_list.sort()
bwt_first_column = ''.join(bwt_list)

print(bwt_first_column)
print(bwt_text)
$aaabnn
annb$aa
In [24]:
def inverse_bwt(bwt_text):
    n = len(bwt_text)
    last_column = list(bwt_text)
    columns = last_column[:]
    columns.sort()
    
    result_index = bwt_text.index('$')
    
    for i in range(n - 1):
        for j in range(n):
            columns[j] = last_column[j] + columns[j]
        
        columns.sort()
        
    return columns[result_index]
In [25]:
inverse_bwt('annb$aa')
Out[25]:
'banana$'
In [45]:
def last_to_first(first_column, last_column, index):
    char_at_index = last_column[index]
    
    n = len(last_column)
    
    rank = 0;
    
    for i in range(index+1):
        if last_column[i] == char_at_index:
            rank += 1
            
    first_column_rank = 0
        
    for i in range(n):
        if first_column[i] == char_at_index:
            first_column_rank += 1
        
        if first_column_rank == rank:
            return i
In [46]:
last_to_first('$aaabnn', 'annb$aa', 5)
Out[46]:
2
In [72]:
def bw_matching(first_column, last_column, pattern):
    top = 0
    bottom = len(first_column) - 1
    
    m = len(pattern)
    j = 0
    
    while top < bottom:
        print('top: {}, bottom: {}'.format(top, bottom))
        if j < m:
            symbol = pattern[j]
            j += 1
            
            top_index = -1
            bottom_index = -1
            
            for i in range(top,bottom + 1):
                if top_index == -1 and last_column[i] == symbol:
                    top_index = i
                    bottom_index = i
                
                elif last_column[i] == symbol:
                    bottom_index = i
                    
            if top_index == -1 or bottom_index == -1:
                return 0
                    
            top = last_to_first(first_column, last_column, top_index)
            bottom = last_to_first(first_column, last_column, bottom_index)
        else:
            break
            
    return  bottom - top + 1
        
In [73]:
word = 'panamabananas$'
pattern = 'ana'

last_column = bwt_construction(word)

first_column = list(last_column[:])
first_column.sort()

bw_matching(first_column, last_column, pattern)
top: 0, bottom: 13
top: 1, bottom: 6
top: 9, bottom: 11
top: 3, bottom: 5
Out[73]:
3