import numpy as np

def cal_similarity(A, B, s_match, s_mismatch):
    # Build the pairwise similarity matrix of two sequences.
    # A: sequence A (rows of the result)
    # B: sequence B (columns of the result)
    # s_match: score stored where the two elements are equal
    # s_mismatch: score stored where the two elements differ
    # return (s): float matrix of shape (len(A), len(B))

    elems_a = list(A)
    elems_b = list(B)
    # Start with every cell at the mismatch score, then overwrite the matches.
    s = np.full((len(elems_a), len(elems_b)), s_mismatch, dtype=float)
    for row, elem_a in enumerate(elems_a):
        for col, elem_b in enumerate(elems_b):
            if elem_a == elem_b:
                s[row, col] = s_match
    return s

def global_scoring(s, gap):
    # Fill the global-alignment score matrix from a similarity matrix.
    # s: similarity matrix of shape (length of A, length of B)
    # gap: gap penalty score, added for every gap
    # return (f): matrix of shape (length of A + 1, length of B + 1);
    #             f[i,j] is the best score for the first i elements of A
    #             against the first j elements of B

    rows, cols = s.shape
    f = np.zeros((rows + 1, cols + 1))
    # Border cells: a prefix aligned against nothing costs one gap per element.
    f[:, 0] = gap * np.arange(rows + 1)
    f[0, :] = gap * np.arange(cols + 1)
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            diagonal = f[i-1, j-1] + s[i-1, j-1]
            from_above = f[i-1, j] + gap
            from_left = f[i, j-1] + gap
            f[i, j] = max(diagonal, from_above, from_left)
    return f

def global_traceback(A, B, f, s, gap):
    # Walk back through the global score matrix and rebuild one optimal alignment.
    # A: sequence A
    # B: sequence B
    # f: score matrix for all possible alignments, shape (len(A)+1, len(B)+1)
    # s: similarity matrix, shape (len(A), len(B))
    # gap: gap penalty
    # return: list of three strings [aligned A, match markers, aligned B];
    #         "-" marks a gap, "|" marks a column where both elements are equal

    str_a = list(A)
    str_b = list(B)
    # Columns are collected from the end of the alignment to its start and
    # reversed once at the end, instead of prepending to a string each step.
    a_cols = []
    b_cols = []
    marks = []

    i = len(str_a)
    j = len(str_b)
    while i > 0 or j > 0:
        if i > 0 and j > 0 and f[i, j] == f[i-1, j-1] + s[i-1, j-1]:
            # Diagonal move: the two elements are aligned with each other.
            a_cols.append(str_a[i-1])
            b_cols.append(str_b[j-1])
            marks.append("|" if str_a[i-1] == str_b[j-1] else " ")
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or f[i, j] == f[i-1, j] + gap):
            # Upward move: element of A against a gap in B.
            # In column 0 this is the only legal move, so it is taken without
            # the equality test: that test can fail through floating-point
            # rounding for fractional gap penalties, which would otherwise
            # send control to the branch below and index str_b[-1].
            a_cols.append(str_a[i-1])
            b_cols.append("-")
            marks.append(" ")
            i -= 1
        else:
            # Leftward move: element of B against a gap in A.
            a_cols.append("-")
            b_cols.append(str_b[j-1])
            marks.append(" ")
            j -= 1
    return ["".join(reversed(a_cols)),
            "".join(reversed(marks)),
            "".join(reversed(b_cols))]

def global_align(A, B, s_match, s_mismatch, gap):
    # Run the full global-alignment pipeline: similarity -> scoring -> traceback.
    # A: sequence A
    # B: sequence B
    # s_match: score for a match
    # s_mismatch: score for a mismatch
    # gap: gap penalty score
    # return: whatever global_traceback returns for these inputs
    similarity = cal_similarity(A, B, s_match, s_mismatch)
    scores = global_scoring(similarity, gap)
    # The score matrix is printed so it can be inspected alongside the alignment.
    print(scores)
    return global_traceback(A, B, scores, similarity, gap)

# Example 1: reward matches only (no penalty for mismatches or gaps).
s_match, s_mismatch, gap = 1, 0, 0
A, B = "GAATTCAGTTA", "GGATCGA"
results = global_align(A, B, s_match, s_mismatch, gap)
# Print the alignment rows one per line.
for r in results:
    print(r)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 2. 2. 2. 2. 2.]
 [0. 1. 1. 2. 2. 2. 2. 3.]
 [0. 1. 1. 2. 3. 3. 3. 3.]
 [0. 1. 1. 2. 3. 3. 3. 3.]
 [0. 1. 1. 2. 3. 4. 4. 4.]
 [0. 1. 1. 2. 3. 4. 4. 5.]
 [0. 1. 2. 2. 3. 4. 5. 5.]
 [0. 1. 2. 2. 3. 4. 5. 5.]
 [0. 1. 2. 2. 3. 4. 5. 5.]
 [0. 1. 2. 3. 3. 4. 5. 6.]]
GAATTCAGTTA
| | || |  |
GGA-TC-G--A

# Example 2: same sequences, now with penalties for mismatches and gaps.
s_match, s_mismatch, gap = 3, -3, -2
A, B = "GAATTCAGTTA", "GGATCGA"
results = global_align(A, B, s_match, s_mismatch, gap)
# Print the alignment rows one per line.
for r in results:
    print(r)

[[  0.  -2.  -4.  -6.  -8. -10. -12. -14.]
 [ -2.   3.   1.  -1.  -3.  -5.  -7.  -9.]
 [ -4.   1.   0.   4.   2.   0.  -2.  -4.]
 [ -6.  -1.  -2.   3.   1.  -1.  -3.   1.]
 [ -8.  -3.  -4.   1.   6.   4.   2.   0.]
 [-10.  -5.  -6.  -1.   4.   3.   1.  -1.]
 [-12.  -7.  -8.  -3.   2.   7.   5.   3.]
 [-14.  -9. -10.  -5.   0.   5.   4.   8.]
 [-16. -11.  -6.  -7.  -2.   3.   8.   6.]
 [-18. -13.  -8.  -9.  -4.   1.   6.   5.]
 [-20. -15. -10. -11.  -6.  -1.   4.   3.]
 [-22. -17. -12.  -7.  -8.  -3.   2.   7.]]
GAATTCAGTTA
| | || |  |
GGA-TC-G--A

def local_scoring(s, gap):
    # Fill the local-alignment score matrix from a similarity matrix.
    # s: similarity matrix of shape (length of A, length of B)
    # gap: gap penalty score
    # return (h): matrix of shape (length of A + 1, length of B + 1);
    #             no cell is allowed to drop below 0

    rows, cols = s.shape
    # np.zeros already leaves the first row and first column at 0.
    h = np.zeros((rows + 1, cols + 1))
    for r in range(rows):
        for c in range(cols):
            candidates = (
                h[r, c] + s[r, c],    # extend along the diagonal
                h[r, c+1] + gap,      # come from the cell above
                h[r+1, c] + gap,      # come from the cell to the left
                0,                    # start a fresh local alignment
            )
            h[r+1, c+1] = max(candidates)
    return h

def local_traceback(A, B, h, s, gap):
    # Rebuild a local alignment by walking back from the largest cell of h.
    # A: sequence A
    # B: sequence B
    # h: score matrix for all possible alignments
    # s: similarity matrix
    # gap: gap penalty
    # return: list of three strings [aligned A, match markers, aligned B];
    #         "-" marks a gap, "|" marks a column where both elements are equal

    seq_a, seq_b = list(A), list(B)
    # Columns are gathered from the end of the alignment backwards.
    top, middle, bottom = [], [], []

    # Start at the first occurrence of the maximum score.
    row, col = np.unravel_index(np.argmax(h, axis=None), h.shape)
    # Stop as soon as a cell with a non-positive score is reached.
    while (row > 0 or col > 0) and h[row, col] > 0:
        here = h[row, col]
        if row > 0 and col > 0 and here == h[row-1, col-1] + s[row-1, col-1]:
            elem_a, elem_b = seq_a[row-1], seq_b[col-1]
            top.append(elem_a)
            bottom.append(elem_b)
            middle.append("|" if elem_a == elem_b else " ")
            row -= 1
            col -= 1
        elif row > 0 and here == h[row-1, col] + gap:
            top.append(seq_a[row-1])
            bottom.append("-")
            middle.append(" ")
            row -= 1
        else:
            top.append("-")
            bottom.append(seq_b[col-1])
            middle.append(" ")
            col -= 1

    top.reverse()
    middle.reverse()
    bottom.reverse()
    return ["".join(top), "".join(middle), "".join(bottom)]

def local_align(A, B, s_match, s_mismatch, gap):
    # Run the full local-alignment pipeline: similarity -> scoring -> traceback.
    # A: sequence A
    # B: sequence B
    # s_match: score for a match
    # s_mismatch: score for a mismatch
    # gap: gap penalty score
    # return: whatever local_traceback returns for these inputs
    similarity = cal_similarity(A, B, s_match, s_mismatch)
    scores = local_scoring(similarity, gap)
    return local_traceback(A, B, scores, similarity, gap)

# Scoring scheme and sequences shared by both alignments below.
s_match, s_mismatch, gap = 3, -3, -2
A, B = "GAATTCAGTTA", "GGATCGA"

# Global alignment (global_align also prints its score matrix)
results_global = global_align(A, B, s_match, s_mismatch, gap)
for r in results_global:
    print(r)

# Local alignment of the same pair, for comparison
results_local = local_align(A, B, s_match, s_mismatch, gap)
for r in results_local:
    print(r)

[[  0.  -2.  -4.  -6.  -8. -10. -12. -14.]
 [ -2.   3.   1.  -1.  -3.  -5.  -7.  -9.]
 [ -4.   1.   0.   4.   2.   0.  -2.  -4.]
 [ -6.  -1.  -2.   3.   1.  -1.  -3.   1.]
 [ -8.  -3.  -4.   1.   6.   4.   2.   0.]
 [-10.  -5.  -6.  -1.   4.   3.   1.  -1.]
 [-12.  -7.  -8.  -3.   2.   7.   5.   3.]
 [-14.  -9. -10.  -5.   0.   5.   4.   8.]
 [-16. -11.  -6.  -7.  -2.   3.   8.   6.]
 [-18. -13.  -8.  -9.  -4.   1.   6.   5.]
 [-20. -15. -10. -11.  -6.  -1.   4.   3.]
 [-22. -17. -12.  -7.  -8.  -3.   2.   7.]]
GAATTCAGTTA
| | || |  |
GGA-TC-G--A
GAATTC-A
| | || |
G-A-TCGA

import pandas as pd
import matplotlib.pyplot as plt

# Load the coordinates: whitespace-separated columns, no header row,
# columns named "x" and "y".
c10 = pd.read_table("./c10.txt", header=None, names=["x", "y"], sep=r"\s+")

# Plot the points in file order, joined by line segments.
plt.plot(c10.x, c10.y, marker="o")
plt.ylabel("y")
plt.xlabel("x")
plt.show()

def tsp_dp(n, d, x, z):
    # Shortest route that starts at city 0, visits every city in z exactly
    # once, and ends at city x.
    # n: number of cities (not used by the computation; kept for the interface)
    # d: distance matrix, indexed as d[i][j]
    # x: the last city
    # z: the cities to pass through before reaching x
    # output: dict with
    #   'g':   minimum distance of such a route
    #   'pos': city order, starting with 0 and followed by the cities of z in
    #          visiting order (x itself is not included); None when no route
    #          with a distance below infinity exists

    # Sub-results are cached on (last city, remaining cities). The remaining
    # cities always keep the relative order they had in z, so a given set of
    # cities always maps to the same key and each sub-problem is solved once
    # instead of once per permutation.
    memo = {}

    def solve(last, remaining):
        key = (last, remaining)
        if key in memo:
            return memo[key]
        if len(remaining) == 0:
            # Nothing left to visit: go straight from city 0 to `last`.
            best = d[0][last]
            order = [0]
        else:
            best = float('inf')
            order = None
            for city in remaining:
                rest = tuple(c for c in remaining if c != city)
                sub_best, sub_order = solve(city, rest)
                dis = sub_best + d[city][last]
                # Strict comparison: on ties the earliest candidate is kept.
                if best > dis:
                    best = dis
                    order = sub_order + [city]
        memo[key] = (best, order)
        return memo[key]

    g, pos = solve(x, tuple(z))
    return {'g': g, 'pos': pos}

# Pairwise Euclidean distances between all rows of c10, via broadcasting:
# (rows, 1, cols) - (1, rows, cols) -> (rows, rows, cols), then the norm over
# the last axis.
d = np.linalg.norm(c10.values[:, None, :] - c10.values[None, :, :], axis=-1)

# Solve for a closed tour over the first n cities that starts and ends at city 0.
n = 10
p = tsp_dp(n, d, 0, list(range(1, n)))['pos']

# Cities as unconnected markers, the tour as a red line.
plt.plot(c10['x'], c10['y'], marker='o', linestyle='None', label='Cities')
plt.plot(c10['x'][p], c10['y'][p], marker='', linestyle='-', color='red', label='TSP Path')
# Close the loop with the segment from the last visited city back to the first.
plt.plot(
    [c10['x'][p[-1]], c10['x'][p[0]]],
    [c10['y'][p[-1]], c10['y'][p[0]]],
    linestyle='-',
    color='red',
)

plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.show()

09B. Dynamic programming

Mingyang Lu

3/28/2024

Dynamic programming

Global sequence alignment

Local sequence alignment

Travelling salesman problem (TSP)