# Gabriel Valiente

## Combinatorial Pattern Matching Algorithms in Computational Biology using Perl and R

Here is the source code for Combinatorial Pattern Matching Algorithms in Computational Biology using Perl and R, by Gabriel Valiente. It has been tested using Perl version 5.8.6 (BioPerl version 1.5) and R version 2.7.1 (seqinr version 1.1-7 and APE version 2.1-3).

### Chapter 1: Introduction

#### Section 1.3: A Motivating Example: Gene Prediction

 sub extract_open_reading_frames { my \$seq = shift; for my \$r (0,1,2) { for (my \$i = \$r; \$i <= length(\$seq)-3; \$i += 3) { if (substr(\$seq,\$i,3) eq "ATG") { my \$j = \$i+3; while (\$j <= length(\$seq)-3 && substr(\$seq,\$j,3) ne "TAA" && substr(\$seq,\$j,3) ne "TAG" && substr(\$seq,\$j,3) ne "TGA") { \$j += 3; } if (\$j <= length(\$seq)-3) { my \$len = \$j+2-\$i+1; if (\$len >= 30) { print substr(\$seq,\$i,\$j+2-\$i+1),"\n"; } } } } } } extract.open.reading.frames <- function (seq) { for (i in 1:3) { while (i+2 <= nchar(seq)) { if (substr(seq,i,i+2) == "ATG") { j <- i + 3 while (j+2 <= nchar(seq) && substr(seq,j,j+2) != "TAA" && substr(seq,j,j+2) != "TAG" && substr(seq,j,j+2) != "TGA") { j <- j + 3 } if (j+2 <= nchar(seq)) { if (j+2-i+1 >= 30) { print(c(i,j+2,substr(seq,i,j+2))) } } } i <- i + 3 } } } sub longest_open_reading_frame { my \$seq = shift; my (\$ii,\$jj) = (0,0); for my \$r (0,1,2) { for (my \$i = \$r; \$i <= length(\$seq)-3; \$i += 3) { if (substr(\$seq,\$i,3) eq "ATG") { my \$j = \$i+3; while (\$j <= length(\$seq)-3 && substr(\$seq,\$j,3) ne "TAA" && substr(\$seq,\$j,3) ne "TAG" && substr(\$seq,\$j,3) ne "TGA") { \$j += 3; } if (\$j <= length(\$seq)-3) { my \$len = \$j+2-\$i+1; if (\$j+2-\$i+1 > \$jj+2-\$ii+1) { \$ii = \$i; \$jj = \$j; } } } } } return [\$ii,\$jj+2]; } longest.open.reading.frame <- function (seq) { ii <- jj <- 0 for (i in 1:3) { while (i+2 <= nchar(seq)) { if (substr(seq,i,i+2) == "ATG") { j <- i + 3 while (j+2 <= nchar(seq) && substr(seq,j,j+2) != "TAA" && substr(seq,j,j+2) != "TAG" && substr(seq,j,j+2) != "TGA") { j <- j + 3 } if (j+2 <= nchar(seq)) { if (j+2-i+1 > jj+2-ii+1) { ii <- i jj <- j } } } i <- i + 3 } } c(ii,jj+2) }

### Chapter 2: Sequences

#### Section 2.2: Sequences in Computer Science

##### Section 2.2.1: Traversing Labeled Sequences
 sub seq_to_labeled_seq { my \$seq = shift; my %seq; for (my \$i = 0; \$i < length(\$seq); \$i++) { my \$elem = substr(\$seq,\$i,1); \$seq{\$elem}++; } return \%seq; } seq.to.labeled.seq <- function (seq) { alphabet <- sort(unique(strsplit(seq,"")[[1]])) lab <- matrix(0,nrow=length(alphabet),ncol=1) dimnames(lab) <- list(alphabet,"count") for (i in 1:nchar(seq)) lab[substr(seq,i,i),] <- lab[substr(seq,i,i),] + 1 lab } sub symmetric_difference { my \$seq1 = shift; my \$seq2 = shift; my %seq1 = %{ seq_to_labeled_seq(\$seq1) }; my %seq2 = %{ seq_to_labeled_seq(\$seq2) }; for my \$elem (keys %seq2) { \$seq1{\$elem} = abs(\$seq1{\$elem}-\$seq2{\$elem}); } return \%seq1; } symmetric.difference <- function (seq1,seq2) { lab1 <- seq.to.labeled.seq(seq1) lab2 <- seq.to.labeled.seq(seq2) for (elem in row.names(lab2)) lab1[elem,] <- abs(lab1[elem,] - lab2[elem,]) lab1 }

#### Section 2.3: Sequences in Computational Biology

##### Section 2.3.1: Reverse Complementing DNA Sequences
 sub reverse_complement { my \$seq = shift; \$seq = reverse \$seq; \$seq =~ tr/ACGT/TGCA/; return \$seq; } reverse.complement <- function (seq) { rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="") chartr("ACGT","TGCA",rev) }
##### Section 2.3.2: Counting RNA Sequences
 sub count { my \$n = shift; my @R; \$R[0] = \$R[1] = \$R[2] = 1; for (my \$i = 2; \$i < \$n; \$i++) { \$R[\$i+1] = \$R[\$i]; for (my \$j = 1; \$j < \$i; \$j++) { \$R[\$i+1] += \$R[\$j-1] * \$R[\$i-\$j] } } return \$R[\$n]; } count <- function (n) { R <- rep(0,n) R[1:3] <- 1 if (n > 2) { for (i in 2:(n-1)) { R[i+2] <- R[i+1] for (j in 1:(i-1)) { R[i+2] <- R[i+2] + R[j] * R[i-j+1] } } } R[n+1] }
##### Section 2.3.3: Generating DNA Sequences
 sub words { my \$n = shift; my \$alphabet = shift; my @n = split "", \$alphabet; if (\$n == 1) { return \@n; } else { my \$short = words(\$n - 1, \$alphabet); my @long; for my \$seq (@\$short) { for my \$n (@n) { push @long, \$seq . \$n; } } return \@long; } } library(seqinr) words(length=3,alphabet=s2c("ACGT"))

### Chapter 3: Simple Pattern Matching in Sequences

#### Section 3.1: Finding Words in Sequences

##### Section 3.1.1: Word Composition of Sequences
 sub word_composition { my \$seq = shift; my \$k = shift; my \$alphabet = shift; my \$words = words(\$k,\$alphabet); my %freq; for my \$word (@\$words) { \$freq{\$word} = 0; } for my \$i (0 .. length(\$seq)-\$k) { my \$word = substr(\$seq,\$i,\$k); \$freq{\$word}++; } return \%freq; } library(seqinr) count(s2c(seq),word=3,alphabet=s2c("ACGT"))
##### Section 3.1.2: Alignment Free Comparison of Sequences
 library(seqinr) alignment.free.distance <- function (seq1,seq2,k,sigma) { freq1 <- count(seq1,word=k,alphabet=s2c(sigma)) freq2 <- count(seq2,word=k,alphabet=s2c(sigma)) cor(freq1,freq2) } sub mean { my \$a = shift; my \$res; foreach (@\$a) { \$res += \$_ } \$res /= @\$a; } sub sd { my \$a = shift; my \$mean = mean(\$a); return sqrt( mean( [map \$_ ** 2, @\$a] ) - (\$mean ** 2) ); } sub cov { my \$a1 = shift; my \$a2 = shift; my \$res; foreach (0 .. @\$a1 - 1) { \$res += \$a1->[\$_] * \$a2->[\$_]; } \$res /= @\$a1; \$res -= mean(\$a1) * mean(\$a2); } sub cor { my \$a1 = shift; my \$a2 = shift; return cov(\$a1,\$a2) / (sd(\$a1) * sd(\$a2)); }

### Chapter 4: General Pattern Matching in Sequences

#### Section 4.1: Finding Subsequences

 sub occurrences { my \$p = shift; my \$t = shift; my \$m = length \$p; my \$n = length \$t; my @L; for (my \$i = 0; \$i < \$n-\$m+1; \$i++) { if (\$p eq substr(\$t,\$i,\$m)) { push @L, \$i; } } return \@L; } occurrences <- function (p,t) { L <- c() for (i in 1:nchar(t)-nchar(p)+1) if (p == substr(t,i,i+nchar(p)-1)) L <- c(L,i) L }

#### Section 4.1.1: Suffix Arrays

 sub suffix_array { my \$s = shift; my @sa = (0 .. length(\$s)-1); @sa = sort {substr(\$s,\$a) cmp substr(\$s,\$b)} @sa; return \@sa; } suffix.array <- function (seq) { sa <- apply(matrix(1:nchar(seq)),1,function(i)substr(seq,i,nchar(seq))) order(sa) } sub occurrences { my \$p = shift; my \$t = shift; my \$sa = shift; my (\$l, \$r) = (0, \$#sa); while (\$l <= \$r) { my \$i = int((\$l + \$r)/2); my \$c = \$p cmp substr(\$t, \$sa[\$i], length \$p); if (\$c < 0) { \$r = \$i - 1; } elsif (\$c > 0) { \$l = \$i + 1; } else { my \$j = \$i; while (\$i > 0 && \$p eq substr(\$t, \$sa[\$i-1], length \$p)) { \$i--; } while (\$j < \$#sa-1 && \$p eq substr(\$t, \$sa[\$j+1], length \$p)) { \$j++; } return (\$i, \$j); } } return (-1, -1); } occurrences <- function (p,t,sa) { ell <- 1 r <- nchar(t) while (ell <= r) { i <- as.integer((ell + r) / 2) s <- substr(t,sa[i],sa[i]+nchar(p)-1) if (p < s) { r <- i - 1 } else if (p > s) { ell <- i + 1 } else { j <- i while (i > 1 && p==substr(t,sa[i-1],sa[i-1]+nchar(p)-1)) { i <- i - 1 } while (j < nchar(t) && p==substr(t,sa[j+1],sa[j+1]+nchar(p)-1)) { j <- j + 1 } return(c(i,j)) } } return(c(-1,-1)) } sub lcp { my (\$p, \$i, \$t, \$j) = @_; my \$c = 0; while (\$i+\$c < length \$p && \$j+\$c < length \$t && substr(\$p,\$i+\$c,1) eq substr(\$t,\$j+\$c,1)) { \$c++; } return \$c; } lcp <- function (seq1,pos1,seq2,pos2) { n1 <- nchar(seq1) n2 <- nchar(seq2) c <- 0 while (pos1+c <= n1 && pos2+c <= n2 && substr(seq1,pos1+c,pos1+c) == substr(seq2,pos2+c,pos2+c)) { c <- c + 1 } c } sub occurrences { my \$p = shift; my \$t = shift; my \$sa = shift; my (\$l, \$r) = (0, \$#sa); my (\$llcp, \$rlcp) = (0, 0); while (\$l <= \$r) { my \$i = int((\$l + \$r)/2); my \$h = \$llcp; \$h = \$rlcp if \$rlcp < \$h; my \$c = lcp(\$p,\$h,\$t,\$sa[\$i]+\$h); if (\$h+\$c == length \$p) { my \$j = \$i; while (\$i > 0 && \$p eq substr(\$t, \$sa[\$i-1], length \$p)) { \$i--; } while (\$j < \$#sa-1 && \$p eq substr(\$t, \$sa[\$j+1], length \$p)) { \$j++; } return (\$i, \$j); } elsif (substr(\$p,\$h+\$c,1) lt substr(\$t,\$sa[\$i]+\$h+\$c,1)) { \$r = \$i - 1; \$rlcp = \$h + \$c; } else { \$l = \$i + 1; \$llcp = \$h + \$c; } } return (-1, -1); } occurrences <- function (p,t,sa) { ell <- 1 r <- nchar(t) llcp <- 0 rlcp <- 0 while (ell <= r) { i <- as.integer((ell + r) / 2) h <- min(llcp,rlcp) c <- lcp(p,h+1,t,sa[i]+h) if (h+c == nchar(p)) { j <- i while (i > 1 && p==substr(t,sa[i-1],sa[i-1]+nchar(p)-1)) { i <- i - 1 } while (j < nchar(t) && p==substr(t,sa[j+1],sa[j+1]+nchar(p)-1)) { j <- j + 1 } return(c(i,j)) } else if (substr(p,h+c+1,h+c+1) < substr(t,sa[i]+h+c,sa[i]+h+c)) { r <- i - 1 rlcp <- h + c } else { ell <- i + 1 llcp <- h + c } } return(c(-1,-1)) }

#### Section 4.2: Finding Common Subsequences

 sub common_occurrences { my \$p = shift; my \$t1 = shift; my \$t2 = shift; my \$m = length \$p; my \$n1 = length \$t1; my \$n2 = length \$t2; my @L; for (my \$i = 0; \$i < \$n1-\$m+1; \$i++) { if (\$p eq substr(\$t1,\$i,\$m)) { for (my \$j = 0; \$j < \$n2-\$m+1; \$j++) { if (\$p eq substr(\$t2,\$j,\$m)) { push @L, \$i, \$j; } } } } return \@L; } common.occurrences <- function (p,t1,t2) { L <- c() for (i in 1:nchar(t1)-nchar(p)+1) if (p == substr(t1,i,i+nchar(p)-1)) for (j in 1:nchar(t2)-nchar(p)+1) if (p == substr(t2,j,j+nchar(p)-1)) L <- c(L,i,j) L } sub longest_common_subsequences { my \$s1 = shift; my \$s2 = shift; my \$n1 = length(\$s1); my \$n2 = length(\$s2); my @L; my \$l = 0; for (my \$i = 0; \$i < \$n1; \$i++) { for (my \$j = \$i; \$j < \$n1; \$j++) { my \$seq = substr(\$s1,\$i,\$j-\$i+1); for (my \$k = 0; \$k < \$n2-\$j+\$i; \$k++) { if (\$seq eq substr(\$s2,\$k,\$j-\$i+1)) { if (length(\$seq) == \$l) { push @L, \$seq; } elsif (length(\$seq) > \$l) { @L = \$seq; \$l = length(\$seq); } } } } } my %count; for my \$seq (@L) { \$count{\$seq}++; } @L = sort keys %count; return \@L; } longest.common.subsequences <- function (s1,s2) { L <- c() ell <- 0 for (i in 1:nchar(s1)) { for (j in i:nchar(s1)) { seq <- substr(s1,i,j) for (k in 1:(nchar(s2)-j+i)) { if (seq == substr(s2,k,k+j-i)) { if (nchar(seq) == ell) { L <- c(L,seq) } else if (nchar(seq) > ell) { L <- c(seq) ell <- nchar(seq) } } } } } unique(L) } sub longest_common_subsequences { my \$s1 = shift; my \$s2 = shift; my @L; my \$l = 0; my @LCS; for (my \$i = 0; \$i < length \$s1; \$i++ ) { for (my \$j = 0; \$j < length \$s2; \$j++ ) { \$LCS[\$i][\$j] = 0; if (substr(\$s1,\$i,1) eq substr(\$s2,\$j,1)) { if (\$i == 0 || \$j == 0) { \$LCS[\$i][\$j] = 1; } else { \$LCS[\$i][\$j] = \$LCS[\$i-1][\$j-1] + 1; } if (\$LCS[\$i][\$j] > \$l) { \$l = \$LCS[\$i][\$j]; @L = (); } if (\$LCS[\$i][\$j] == \$l) { push @L, substr(\$s1,\$i-\$l+1,\$l); } } } } return @L; } longest.common.subsequences <- function (s1,s2) { L <- c() ell <- 0 LCS <- matrix(0,nrow=nchar(s1),ncol=nchar(s2)) for (i in 1:nchar(s1)) { for (j in 1:nchar(s2)) { if (substr(s1,i,i) == substr(s2,j,j)) { if (i == 1 || j == 1) { LCS[i,j] <- 1 } else { LCS[i,j] <- LCS[i-1,j-1] + 1 } if (LCS[i,j] > ell) { ell <- LCS[i,j] L <- c() } if (LCS[i,j] == ell) { L <- c(L,substr(s1,i-ell+1,i)) } } } } L }

#### Section 4.2.1: Generalized Suffix Arrays

 sub generalized_suffix_array { my \$seq = shift; my %seq = %{\$seq}; my @gsa; for my \$id (keys %seq) { for my \$i (0 .. length(\$seq{\$id})-1) { push @gsa, [\$i, \$id]; } } @gsa = sort {substr(\$seq{@{\$a}[1]}, @{\$a}[0]) cmp substr(\$seq{@{\$b}[1]}, @{\$b}[0])} @gsa; return \@gsa; } generalized.suffix.array <- function (s1,s2) { sa1 <- apply(matrix(1:nchar(s1)),1, function (i) substr(s1,i,nchar(s1))) sa2 <- apply(matrix(1:nchar(s2)),1, function (i) substr(s2,i,nchar(s2))) arr <- order(c(sa1,sa2)) tag <- sapply(arr,function (i) if (i <= nchar(s1)) { 1 } else { 2 } ) arr <- sapply(arr,function (i) if (i <= nchar(s1)) { i } else { i-nchar(s1) } ) cbind(arr,tag) } sub common_occurrences { my \$p = shift; my \$seq = shift; my %seq = %{\$seq}; my \$gsa = shift; my @gsa = @{\$gsa}; my (\$l, \$r) = (0, \$#gsa); while (\$l <= \$r) { my \$i = int((\$l + \$r)/2); my \$c = \$p cmp substr(\$seq{\$gsa[\$i][1]}, \$gsa[\$i][0], length \$p); if (\$c < 0) { \$r = \$i - 1; } elsif (\$c > 0) { \$l = \$i + 1; } else { my \$j = \$i; while (\$i > 0 && \$p eq substr(\$seq{\$gsa[\$i-1][1]}, \$gsa[\$i-1][0], length \$p)) { \$i--; } while (\$j < \$#gsa && \$p eq substr(\$seq{\$gsa[\$j+1][1]}, \$gsa[\$j+1][0], length \$p)) { \$j++; } return (\$i, \$j); } } return (-1, -1); } common.occurrences <- function (p,seq,gsa) { ell <- 1 r <- n <- nchar(seq[1]) + nchar(seq[2]) while (ell <= r) { i <- as.integer((ell + r) / 2) s <- substr(seq[gsa[i,2]], gsa[i,1],gsa[i,1]+nchar(p)-1) if (p < s) { r <- i - 1 } else if (p > s) { ell <- i + 1 } else { j <- i while (i > 1 && p == s) { s <- substr(seq[gsa[i-1,2]], gsa[i-1,1],gsa[i-1,1]+nchar(p)-1) if (p == s) i <- i - 1 } s <- p while (j < n && p == s) { s <- substr(seq[gsa[j+1,2]], gsa[j+1,1],gsa[j+1,1]+nchar(p)-1) if (p == s) j <- j + 1 } return(c(i,j)) } } return(c(-1,-1)) } sub common_occurrences { my \$p = shift; my \$seq = shift; my %seq = %{\$seq}; my \$gsa = shift; my @gsa = @{\$gsa}; my (\$l, \$r) = (0, \$#gsa); my (\$llcp, \$rlcp) = (0, 0); while (\$l <= \$r) { my \$i = int((\$l + \$r)/2); my \$h = \$llcp; \$h = \$rlcp if \$rlcp < \$h; my \$c = lcp(\$p,\$h,\$seq{\$gsa[\$i][1]},\$gsa[\$i][0]+\$h); if (\$h+\$c == length \$p) { my \$j = \$i; while (\$i > 0 && \$p eq substr(\$seq{\$gsa[\$i-1][1]}, \$gsa[\$i-1][0], length \$p)) { \$i--; } while (\$j < \$#gsa-1 && \$p eq substr(\$seq{\$gsa[\$j+1][1]}, \$gsa[\$j+1][0], length \$p)) { \$j++; } return (\$i, \$j); } elsif (substr(\$p,\$h+\$c,1) lt substr(\$seq{\$gsa[\$i][1]},\$gsa[\$i][0]+\$h+\$c,1)) { \$r = \$i - 1; \$rlcp = \$h + \$c; } else { \$l = \$i + 1; \$llcp = \$h + \$c; } } return (-1, -1); } common.occurrences <- function (p,seq,gsa) { ell <- 1 r <- n <- nchar(seq[1]) + nchar(seq[2]) llcp <- 0 rlcp <- 0 while (ell <= r) { i <- as.integer((ell + r) / 2) s <- substr(seq[gsa[i,2]],gsa[i,1], gsa[i,1]+nchar(p)-1) h <- min(llcp,rlcp) c <- lcp(p,h+1,s,1+h) if (h+c == nchar(p)) { j <- i while (i > 1 && p == s) { s <- substr(seq[gsa[i-1,2]],gsa[i-1,1], gsa[i-1,1]+nchar(p)-1) if (p == s) i <- i - 1 } s <- p while (j < n && p == s) { s <- substr(seq[gsa[j+1,2]],gsa[j+1,1], gsa[j+1,1]+nchar(p)-1) if (p == s) j <- j + 1 } return(c(i,j)) } else { s <- substr(seq[gsa[i,2]],gsa[i,1]+h+c, gsa[i,1]+h+c) if (substr(p,h+c+1,h+c+1) < s) { r <- i - 1 rlcp <- h + c } else { ell <- i + 1 llcp <- h + c } } } return(c(-1,-1)) } sub longest_common_subsequences { my \$seq = shift; my %seq = %{\$seq}; my \$gsa = shift; my @gsa = @{\$gsa}; my @L; my \$l = 0; my \$f2 = substr(\$seq{\$gsa[0][1]}, \$gsa[0][0], length(\$seq{\$gsa[0][1]})); for (my \$i = 1; \$i < length(\$seq{1})+length(\$seq{2}); \$i++) { my \$f1 = \$f2; \$f2 = substr(\$seq{\$gsa[\$i][1]}, \$gsa[\$i][0], length(\$seq{\$gsa[\$i][1]})); if (\$gsa[\$i][1] != \$gsa[\$i-1][1]) { my \$p = lcp(\$f1,0,\$f2,0); if (\$p > \$l) { \$l = \$p; @L = (); } if (\$p == \$l) { push @L, substr(\$seq{\$gsa[\$i][1]},\$gsa[\$i][0],\$p); } } } return \@L; } longest.common.subsequences <- function (seq,gsa) { L <- c() ell <- 0 suf2 <- substr(seq[gsa[1,2]],gsa[1,1], nchar(seq[gsa[1,2]])) for (i in 2:(nrow(gsa))) { suf1 <- suf2 suf2 <- substr(seq[gsa[i,2]],gsa[i,1], nchar(seq[gsa[i,2]])) if (gsa[i-1,2] != gsa[i,2]) { pref <- substr(suf1,1,lcp(suf1,1,suf2,1)) if (nchar(pref) > ell) { ell <- nchar(pref) L <- c() } if (nchar(pref) == ell) { L <- c(L,pref) } } } L }

#### Section 4.3.1: Edit Distance Based Comparison of Sequences

 sub hamming_distance { my \$s1 = shift; my \$s2 = shift; my \$d = -1; if (length \$s1 == length \$s2) { \$d = 0; for (my \$i = 0; \$i < length \$s1; \$i++) { \$d++ if substr(\$s1,\$i,1) ne substr(\$s2,\$i,1); } } return \$d; } hamming.distance <- function (s1,s2) { if (nchar(s1) != nchar(s2)) -1 else sum(strsplit(s1,"")[[1]] != strsplit(s2,"")[[1]]) } sub levenshtein_distance { my \$s1 = shift; my \$s2 = shift; my (\$n1, \$n2) = (length \$s1, length \$s2); my @d; for ( my \$i = 0; \$i <= \$n1; \$i++ ) { \$d[\$i][0] = \$i; } for ( my \$j = 0; \$j <= \$n2; \$j++ ) { \$d[0][\$j] = \$j; } my @t1 = split //, \$s1; my @t2 = split //, \$s2; for ( my \$i = 1; \$i <= \$n1; \$i++ ) { for ( my \$j = 1; \$j <= \$n2; \$j++ ) { if (\$t1[\$i-1] eq \$t2[\$j-1]) { \$d[\$i][\$j] = \$d[\$i-1][\$j-1]; } else { \$d[\$i][\$j] = \$d[\$i-1][\$j] + 1; if (\$d[\$i][\$j-1] + 1 < \$d[\$i][\$j]) { \$d[\$i][\$j] = \$d[\$i][\$j-1] + 1; } } } } return \@d; } levenshtein.distance <- function(s1,s2) { t1 <- strsplit(s1,split="")[[1]] t2 <- strsplit(s2,split="")[[1]] n1 <- length(t1) n2 <- length(t2) d <- array(0,dim=c(n1+1,n2+1)) d[,1] <- 0:n1 d[1,] <- 0:n2 for (i in 2:(n1+1)) for (j in 2:(n2+1)) if (t1[i-1] == t2[j-1]) d[i,j] <- d[i-1,j-1] else d[i,j] <- min(d[i-1,j]+1, d[i,j-1]+1) d } sub edit_distance { my \$s1 = shift; my \$s2 = shift; my (\$n1, \$n2) = (length \$s1, length \$s2); my @d; for ( my \$i = 0; \$i <= \$n1; \$i++ ) { \$d[\$i][0] = \$i; } for ( my \$j = 0; \$j <= \$n2; \$j++ ) { \$d[0][\$j] = \$j; } my @t1 = split //, \$s1; my @t2 = split //, \$s2; for ( my \$i = 1; \$i <= \$n1; \$i++ ) { for ( my \$j = 1; \$j <= \$n2; \$j++ ) { \$d[\$i][\$j] = \$d[\$i-1][\$j] + 1; if (\$d[\$i][\$j-1] + 1 < \$d[\$i][\$j]) { \$d[\$i][\$j] = \$d[\$i][\$j-1] + 1; } if (\$t1[\$i-1] eq \$t2[\$j-1]) { if (\$d[\$i-1][\$j-1] < \$d[\$i][\$j]) { \$d[\$i][\$j] = \$d[\$i-1][\$j-1]; } } else { if (\$d[\$i-1][\$j-1] + 1 < \$d[\$i][\$j]) { \$d[\$i][\$j] = \$d[\$i-1][\$j-1] + 1; } } } } return \@d; } edit.distance <- function (s1,s2) { t1 <- strsplit(s1,split="")[[1]] t2 <- strsplit(s2,split="")[[1]] n1 <- length(t1) n2 <- length(t2) d <- array(0,dim=c(n1+1,n2+1)) d[,1] <- 0:n1 d[1,] <- 0:n2 for (i in 2:(n1+1)) { for (j in 2:(n2+1)) { d[i,j] <- min(d[i-1,j]+1, d[i,j-1]+1) if (t1[i-1] == t2[j-1]) d[i,j] <- min(d[i,j], d[i-1,j-1]) else d[i,j] <- min(d[i,j], d[i-1,j-1]+1) } } d }

#### Section 4.3.2: Alignment Based Comparison of Sequences

 sub alignment { my \$s1 = shift; my \$s2 = shift; my \$d = shift; my @d = @{\$d}; my (\$t1, \$t2) = ("", ""); my @t1 = split //, \$s1; my @t2 = split //, \$s2; my (\$n1, \$n2) = (length \$s1, length \$s2); my (\$i, \$j) = (\$n1, \$n2); while (\$i > 0 && \$j > 0) { if (\$d[\$i][\$j] eq \$d[\$i][\$j-1]+1) { \$t1 = "-" . \$t1; \$t2 = \$t2[\$j-1] . \$t2; \$j--; } elsif (\$d[\$i][\$j] eq \$d[\$i-1][\$j]+1) { \$t1 = \$t1[\$i-1] . \$t1; \$t2 = "-" . \$t2; \$i--; } else { \$t1 = \$t1[\$i-1] . \$t1; \$t2 = \$t2[\$j-1] . \$t2; \$i--; \$j--; } } while (\$j > 0) { \$t1 = "-" . \$t1; \$t2 = \$t2[\$j-1] . \$t2; \$j--; } while (\$i > 0) { \$t1 = \$t1[\$i-1] . \$t1; \$t2 = "-" . \$t2; \$i--; } print "\$t1\n\$t2\n"; } alignment <- function (s1,s2,d) { t1 <- strsplit(s1,split="")[[1]] t2 <- strsplit(s2,split="")[[1]] a1 <- a2 <- c() i <- length(t1)+1 j <- length(t2)+1 while (i > 1 && j > 1) { if (d[i,j] == d[i,j-1]+1) { a1 <- c("-",a1) a2 <- c(t2[j-1],a2) j <- j-1 } else if (d[i,j] == d[i-1,j]+1) { a1 <- c(t1[i-1],a1) a2 <- c("-",a2) i <- i-1 } else { a1 <- c(t1[i-1],a1) a2 <- c(t2[j-1],a2) i <- i-1 j <- j-1 } } while (j > 1) { a1 <- c("-",a1) a2 <- c(t2[j-1],a2) j <- j-1 } while (i > 1) { a1 <- c(t1[i-1],a1) a2 <- c("-",a2) i <- i-1 } print(paste(a1,collapse="")) print(paste(a2,collapse="")) } sub local_alignment { my (\$s1, \$s2, \$match, \$mismatch, \$gap) = @_; my (\$n1, \$n2) = ( length \$s1, length \$s2); my @s; for (my \$i = 0; \$i <= \$n1; \$i++) { \$s[\$i][0] = 0; } for (my \$j = 0; \$j <= \$n2; \$j++) { \$s[0][\$j] = 0; } my @t1 = split //, \$s1; my @t2 = split //, \$s2; for ( my \$i = 1; \$i <= \$n1; \$i++ ) { for ( my \$j = 1; \$j <= \$n2; \$j++ ) { if (\$t1[\$i-1] eq \$t2[\$j-1]) { \$s[\$i][\$j] = \$s[\$i-1][\$j-1] + \$match; } else { \$s[\$i][\$j] = \$s[\$i-1][\$j-1] + \$mismatch; } \$s[\$i][\$j] = \$s[\$i-1][\$j] + \$gap if \$s[\$i-1][\$j] + \$gap > \$s[\$i][\$j]; \$s[\$i][\$j] = \$s[\$i][\$j-1] + \$gap if \$s[\$i][\$j-1] + \$gap > \$s[\$i][\$j]; \$s[\$i][\$j] = 0 if 0 > \$s[\$i][\$j]; } } my \$max = 0; my @opt; for ( my \$i = 1; \$i <= \$n1; \$i++ ) { for ( my \$j = 1; \$j <= \$n2; \$j++ ) { if (\$s[\$i][\$j] > \$max) { \$max = \$s[\$i][\$j]; @opt = (); } if (\$s[\$i][\$j] == \$max) { \$max = \$s[\$i][\$j]; push @opt, \$i, \$j; } } } while (@opt) { my \$i = shift @opt; my \$j = shift @opt; my (\$t1, \$t2) = ("", ""); while (\$s[\$i][\$j] != 0) { if ((\$t1[\$i-1] eq \$t2[\$j-1] && \$s[\$i][\$j] eq \$s[\$i-1][\$j-1] + \$match) || (\$t1[\$i-1] ne \$t2[\$j-1] && \$s[\$i][\$j] eq \$s[\$i-1][\$j-1] + \$mismatch)) { \$t1 = \$t1[\$i-1] . \$t1; \$t2 = \$t2[\$j-1] . \$t2; \$i--; \$j--; } elsif (\$s[\$i][\$j] eq \$s[\$i-1][\$j] + \$gap) { \$t1 = \$t1[\$i-1] . \$t1; \$t2 = "-" . \$t2; \$i--; } elsif (\$s[\$i][\$j] eq \$s[\$i][\$j-1] + \$gap) { \$t1 = "-" . \$t1; \$t2 = \$t2[\$j-1] . \$t2; \$j--; } } print "\$t1\n\$t2\n"; } } local.alignment <- function (s1,s2,match,mismatch,gap) { t1 <- strsplit(s1,split="")[[1]] t2 <- strsplit(s2,split="")[[1]] n1 <- length(t1) n2 <- length(t2) s <- array(0,dim=c(n1+1,n2+1)) s[,1] <- 0 s[1,] <- 0 for (i in 2:(n1+1)) { for (j in 2:(n2+1)) { if (t1[i-1] == t2[j-1]) s[i,j] <- s[i-1,j-1] + match else s[i,j] <- s[i-1,j-1] + mismatch s[i,j] <- max( s[i,j], s[i-1,j] + gap, s[i,j-1] + gap, 0) } } opt <- which(s==max(s),arr.ind=TRUE) for (sol in 1:nrow(opt)) { i <- opt[sol,1] j <- opt[sol,2] m1 <- c() m2 <- c() while (s[i,j] != 0) { if ((t1[i-1] == t2[j-1] && s[i,j] == s[i-1,j-1] + match) || (t1[i-1] != t2[j-1] && s[i,j] == s[i-1,j-1] + mismatch)) { m1 <- c(t1[i-1],m1) m2 <- c(t2[j-1],m2) i <- i - 1 j <- j - 1 } else if (s[i,j] == s[i-1,j] + gap) { m1 <- c(t1[i-1],m1) m2 <- c("-",m2) i <- i - 1 } else if (s[i,j] == s[i,j-1] + gap) { m1 <- c("-",m1) m2 <- c(t2[j-1],m2) j <- j - 1 } } print(paste(m1,collapse="")) print(paste(m2,collapse="")) } }

### Chapter 6: Simple Pattern Matching in Trees

#### Section 6.1: Finding Paths in Unrooted Trees

##### Section 6.1.2: The Partition Distance between Unrooted Trees
 sub partition { my \$tree = shift; my %partition = (); for my \$node (\$tree->get_nodes) { next if \$node->is_Leaf; # discard terminal nodes next unless \$node->ancestor; # discard root my @a = sort { \$a cmp \$b } map { \$_->id } grep {\$_->is_Leaf } \$node->get_all_Descendents; my \$aa = join ",", @a; my @b = (); my %count = (); foreach my \$label (@a, map { \$_->id } \$tree->get_leaf_nodes) { \$count{\$label}++ } foreach my \$label (keys %count) { push @b, \$label if \$count{\$label} == 1; } my \$bb = join ",", sort { \$a cmp \$b } @b; my \$p = \$aa le \$bb ? "\$aa:\$bb" : "\$bb:\$aa"; \$partition{\$p} = 1; } my @partition = sort keys %partition; return \@partition; } sub partition_distance { my \$tree1 = shift; my \$tree2 = shift; my @partition1 = @{ partition(\$tree1) }; my @partition2 = @{ partition(\$tree2) }; my \$dist = 0; my %count = (); foreach my \$p (@partition1, @partition2) { \$count{\$p}++ } foreach my \$p (keys %count) { \$dist++ if \$count{\$p} == 1; } return \$dist; } library(ape) dist.topo(t1,t2)
##### Section 6.1.3: The Nodal Distance between Unrooted Trees
 sub distances { my \$tree = shift; my @leaves = sort {\$a->id cmp \$b->id} \$tree->get_leaf_nodes; my @labels = map { \$_->id } @leaves; my \$n = scalar @labels; my @dist; for my \$i (1..\$n-1) { my @nodes = \$tree->find_node(-id => \$labels[\$i-1]); for my \$j (\$i+1..\$n) { push @nodes, \$tree->find_node(-id => \$labels[\$j-1]); push @dist, \$tree->distance(-nodes => \@nodes); pop @nodes; } } return \@dist; } sub nodal_distance { my \$tree1 = shift; my \$tree2 = shift; my @dist1 = @{ distances(\$tree1) }; my @dist2 = @{ distances(\$tree2) }; my (\$dist, \$d1, \$d2); while (@dist1 and @dist2) { \$d1 = pop @dist1; \$d2 = pop @dist2; \$dist += abs(\$d1-\$d2); } return \$dist; } library(ape) nodal.distance <- function (t1,t2) { m1<-cophenetic.phylo(t1) m1<-m1[order(rownames(m1)),order(colnames(m1))] n1<-m1[upper.tri(m1)] m2<-cophenetic.phylo(t2) m2<-m2[order(rownames(m2)),order(colnames(m2))] n2<-m2[upper.tri(m2)] sum(abs(n1-n2)) }

### Chapter 7: General Pattern Matching in Trees

#### Section 7.1: Finding Subtrees

##### Section 7.1.1: Finding Subtrees Induced by Triplets
 sub triplet { my \$tree = shift; my \$i = shift; my \$j = shift; my \$k = shift; my @ij = grep { \$_->id =~ /\$i|\$j/ } \$tree->get_leaf_nodes; my @ik = grep { \$_->id =~ /\$i|\$k/ } \$tree->get_leaf_nodes; my @jk = grep { \$_->id =~ /\$j|\$k/ } \$tree->get_leaf_nodes; my \$ij = \$tree->get_lca(-nodes => \@ij); my \$ik = \$tree->get_lca(-nodes => \@ik); my \$jk = \$tree->get_lca(-nodes => \@jk); my \$str; if (\$ik == \$jk) { \$str = "((\$i,\$j),\$k);"; } else { if (\$ij == \$jk) { \$str = "((\$i,\$k),\$j);"; } else { \$str = "((\$j,\$k),\$i);"; } } return \$str; } library(ape) triplet <- function (t,i,j,k) { ij <- mrca(t)[i,j] ik <- mrca(t)[i,k] jk <- mrca(t)[j,k] if (ik == jk) paste("((",i,",",j,"),",k,");",sep="") else if (ij == jk) paste("((",i,",",k,"),",j,");",sep="") else paste("((",j,",",k,"),",i,");",sep="") }
##### Section 7.1.2: Finding Subtrees Induced by Quartets
 use Clone qw(clone); sub quartet { my (\$tree,\$i,\$j,\$k,\$l) = @_; my \$copy = clone \$tree; map { \$copy->remove_Node(\$_) } grep { !(\$_->id =~ /\$i|\$j|\$k|\$l/) } \$copy->get_leaf_nodes; return \$copy; } library(ape) quartet <- function (t,i,j,k,l) { unroot(drop.tip(t,setdiff(t\$tip.label,c(i,j,k,l)))) }

#### Section 7.2: Finding Common Subtrees

##### Section 7.2.1: Maximum Agreement of Rooted Trees
 parent <- function (tree,node) tree\$edge[which(tree\$edge[,2]==node),1] children <- function (tree,node) tree\$edge[which(tree\$edge[,1]==node),2] is.root <- function (tree,node) length(tree\$tip.label)+1==node is.ancestor <- function(tree,anc,des) { if (anc==des) return(TRUE) if (is.root(tree,des)) return(FALSE) else return(is.ancestor(tree,anc,parent(tree,des))) } postorder <- function (tree) { r <- length(tree\$tip.label)+1 postorder.traversal(tree,r,c()) } postorder.traversal <- function (tree,v,res) { for (w in t1\$edge[which(tree\$edge[,1]==v),2]) res <- postorder.traversal(tree,w,res) c(res,v) } sub mast_size { my \$tree1 = shift; my \$tree2 = shift; my (@m, \$m); for my \$node1 (@{ \$tree1->Bio::Tree::Compatible::postorder_traversal }) { my \$n1 = \$node1->internal_id; for my \$node2 (@{ \$tree2->Bio::Tree::Compatible::postorder_traversal }) { my \$n2 = \$node2->internal_id; if (\$node1->is_Leaf) { if (\$node2->is_Leaf) { \$m = (\$node1->id eq \$node2->id ? 1 : 0); } else { my \$label1 = \$node1->id; \$m = (grep { /\$label1/ } map { \$_->id } grep { \$_->is_Leaf } \$node2->get_all_Descendents) ? 1 : 0; } } else { if (\$node2->is_Leaf) { my \$label2 = \$node2->id; \$m = (grep { /\$label2/ } map { \$_->id } grep { \$_->is_Leaf } \$node1->get_all_Descendents) ? 1 : 0; } else { my (\$l1,\$r1) = map { \$_->internal_id } \$node1->each_Descendent; my (\$l2,\$r2) = map { \$_->internal_id } \$node2->each_Descendent; \$m = \$m[\$l1][\$l2] + \$m[\$r1][\$r2]; if (\$m[\$l1][\$r2] + \$m[\$r1][\$l2] > \$m) { \$m = \$m[\$l1][\$r2] + \$m[\$r1][\$l2]; } if (\$m[\$n1][\$l2] > \$m) { \$m = \$m[\$n1][\$l2]; } if (\$m[\$n1][\$r2] > \$m) { \$m = \$m[\$n1][\$r2]; } if (\$m[\$l1][\$n2] > \$m) { \$m = \$m[\$l1][\$n2]; } if (\$m[\$r1][\$n2] > \$m) { \$m = \$m[\$r1][\$n2]; } } } \$m[\$n1][\$n2] = \$m; } } return \$m; } mast.size <- function (t1,t2) { po1 <- postorder(t1) po2 <- postorder(t2) m <- matrix(0,nrow=length(po1),ncol=length(po2),dimnames=list(po1,po2)) for (i in po1) { for (j in po2) { if (length(t1\$edge[which(t1\$edge[,1]==i),1])==0) { if (length(t2\$edge[which(t2\$edge[,1]==j),1])==0) { if (t1\$tip.label[i]==t2\$tip.label[j]) m[i,j] <- 1 } else { jj <- which(t2\$tip.label==t1\$tip.label[i]) if (is.ancestor(t2,j,jj)) m[i,j] <- 1 } } else { if (length(t2\$edge[which(t2\$edge[,1]==j),1])==0) { ii <- which(t1\$tip.label==t2\$tip.label[j]) if (is.ancestor(t1,i,ii)) m[i,j] <- 1 } else { l1 <- children(t1,i)[1] r1 <- children(t1,i)[2] l2 <- children(t2,j)[1] r2 <- children(t2,j)[2] m[i,j] <- max(m[l1,l2]+m[r1,r2],m[l1,r2]+m[r1,l2], m[i,l2],m[i,r2],m[l1,j],m[r1,j]) } } } } m <- m[po1,po2] dimnames(m) <- list(po1,po2) m } sub mast { my \$tree1 = shift; my \$tree2 = shift; my @m; for my \$node1 (@{ \$tree1->Bio::Tree::Compatible::postorder_traversal }) { my \$n1 = \$node1->internal_id; for my \$node2 (@{ \$tree2->Bio::Tree::Compatible::postorder_traversal }) { my \$n2 = \$node2->internal_id; my @mm = (); if (\$node1->is_Leaf) { if (\$node2->is_Leaf) { if (\$node1->id eq \$node2->id) { @mm = (\$node1->id); } } else { my \$label1 = \$node1->id; if (grep { /\$label1/ } map { \$_->id } grep { \$_->is_Leaf } \$node2->get_all_Descendents) { @mm = (\$label1); } } } else { if (\$node2->is_Leaf) { my \$label2 = \$node2->id; if (grep { /\$label2/ } map { \$_->id } grep { \$_->is_Leaf } \$node1->get_all_Descendents) { @mm = (\$label2); } } else { my (\$l1,\$r1) = map { \$_->internal_id } \$node1->each_Descendent; my (\$l2,\$r2) = map { \$_->internal_id } \$node2->each_Descendent; @mm = @{\$m[\$l1][\$l2]}; push @mm, @{\$m[\$r1][\$r2]}; if (scalar(@{\$m[\$l1][\$r2]})+scalar(@{\$m[\$r1][\$l2]})>scalar(@mm)) { @mm = @{\$m[\$l1][\$r2]}; push @mm, @{\$m[\$r1][\$l2]}; } if (scalar(@{\$m[\$n1][\$l2]})>scalar(@mm)) { @mm = @{\$m[\$n1][\$l2]}; } if (scalar(@{\$m[\$n1][\$r2]})>scalar(@mm)) { @mm = @{\$m[\$n1][\$r2]}; } if (scalar(@{\$m[\$l1][\$n2]})>scalar(@mm)) { @mm = @{\$m[\$l1][\$n2]}; } if (scalar(@{\$m[\$r1][\$n2]})>scalar(@mm)) { @mm = @{\$m[\$r1][\$n2]}; } } } \$m[\$n1][\$n2] = \@mm; } } return \@m; } mast <- function (t1,t2) { po1 <- postorder(t1) po2 <- postorder(t2) m <- matrix(rep(list(),length(po1)),nrow=length(po1),ncol=length(po2), dimnames=list(po1,po2)) for (i in po1) { for (j in po2) { mm <- list() if (length(t1\$edge[which(t1\$edge[,1]==i),1])==0) { if (length(t2\$edge[which(t2\$edge[,1]==j),1])==0) { if (t1\$tip.label[i]==t2\$tip.label[j]) mm <- c(t1\$tip.label[i]) } else { jj <- which(t2\$tip.label==t1\$tip.label[i]) if (is.ancestor(t2,j,jj)) mm <- c(t1\$tip.label[i]) } } else { if (length(t2\$edge[which(t2\$edge[,1]==j),1])==0) { ii <- which(t1\$tip.label==t2\$tip.label[j]) if (is.ancestor(t1,i,ii)) mm <- c(t2\$tip.label[j]) } else { l1 <- children(t1,i)[1] r1 <- children(t1,i)[2] l2 <- children(t2,j)[1] r2 <- children(t2,j)[2] mm <- c(m[[l1,l2]],m[[r1,r2]]) if ( length(m[[l1,r2]]) + length(m[[r1,l2]]) > length(mm) ) mm <- c(m[[l1,r2]],m[[r1,l2]]) if ( length(m[[i,l2]]) > length(mm) ) mm <- m[[i,l2]] if ( length(m[[i,r2]]) > length(mm) ) mm <- m[[i,r2]] if ( length(m[[l1,j]]) > length(mm) ) mm <- m[[l1,j]] if ( length(m[[r1,j]]) > length(mm) ) mm <- m[[r1,j]] } } m[[i,j]] <- mm } } unlist(m[[length(t1\$tip.label)+1,length(t2\$tip.label)+1]]) }

#### Section 7.3: Comparing Trees

##### Section 7.3.1: The Triplets Distance between Rooted Trees
 sub triplets_distance { my \$tree1 = shift; my \$tree2 = shift; my @leaves = sort {\$a->id cmp \$b->id} \$tree1->get_leaf_nodes; my @labels = map { \$_->id } @leaves; my \$dist = 0; for (my \$i = 0; \$i < @leaves; \$i++) { for (my \$j = \$i+1; \$j < @leaves; \$j++) { for (my \$k = \$j+1; \$k < @leaves; \$k++) { my \$t1 = triplet(\$tree1,\$labels[\$i],\$labels[\$j],\$labels[\$k]); my \$t2 = triplet(\$tree2,\$labels[\$i],\$labels[\$j],\$labels[\$k]); \$dist += 2 unless (\$t1 eq \$t2); } } } return \$dist; } triplets.distance <- function (t1,t2) { L <- sort(t1\$tip.label) d <- 0 for (i in L[1:(length(L)-2)]) { for (j in L[(match(i,L)+1):(length(L)-1)]) { for (k in L[(match(j,L)+1):length(L)]) { str1 <- triplet(t1,i,j,k) str2 <- triplet(t2,i,j,k) if (str1 != str2) { d <- d + 2 } } } } d }
##### Section 7.3.2: The Quartets Distance between Unrooted Trees
 sub quartets_distance { my \$tree1 = shift; my \$tree2 = shift; my @leaves = sort {\$a->id cmp \$b->id} \$tree1->get_leaf_nodes; my @labels = map { \$_->id } @leaves; my \$dist = 0; for (my \$i = 0; \$i < @leaves; \$i++) { for (my \$j = \$i+1; \$j < @leaves; \$j++) { for (my \$k = \$j+1; \$k < @leaves; \$k++) { for (my \$l = \$k+1; \$l < @leaves; \$l++) { my \$q1 = quartet(\$tree1, \$labels[\$i],\$labels[\$j],\$labels[\$k],\$labels[\$l]); my \$q2 = quartet(\$tree2, \$labels[\$i],\$labels[\$j],\$labels[\$k],\$labels[\$l]); if (partition_distance(\$q1,\$q2) != 0) { \$dist += 2; } } } } } return \$dist; } quartets.distance <- function (t1,t2) { L <- sort(t1\$tip.label) d <- 0 for (i in L[1:(length(L)-3)]) { for (j in L[(match(i,L)+1):(length(L)-2)]) { for (k in L[(match(j,L)+1):(length(L)-1)]) { for (l in L[(match(k,L)+1):length(L)]) { q1 <- quartet(t1,i,j,k,l) q2 <- quartet(t2,i,j,k,l) if (dist.topo(q1,q2) != 0) { d <- d + 2 } } } } } d }

### Chapter 9: Simple Pattern Matching in Graphs

#### Section 9.1: Finding Paths in Graphs

 network.height <- function (net) { V(net)\$height <- 0 V(net)\$visited <- FALSE Q <- as.vector(V(net)[which(degree(net,mode="out")==0)-1]) while (length(Q)>0) { j <- Q[1] Q <- Q[-1] V(net)[j]\$visited <- TRUE I <- as.vector(V(net)[nei(j,mode="in")]) for (i in I) { V(net)[i]\$height <- max(V(net)[i]\$height,V(net)[j]\$height+1) if (all(V(net)[nei(i,mode="out")]\$visited)) Q <- c(Q,i) } } V(net)\$height }
##### Section 9.1.1: Distances in Graphs
 use Bio::PhyloNetwork; sub is_strict_ancestor { my \$net = shift; my \$u = shift; my \$v = shift; my \$dag = \$net->graph->copy_graph; my @roots = \$dag->source_vertices; my \$root = shift @roots; \$dag->delete_vertex(\$u); return !\$dag->is_reachable(\$root,\$v); } library(igraph) is.strict.ancestor <- function (net,i,j) { if (length(get.all.shortest.paths(net, V(net)[i],V(net)[j],mode="out")) == 0) return(FALSE) r <- V(net)[which(degree(net,mode="in")==0)-1] if (i == r || i == j) return(TRUE) net <- delete.vertices(net,i) if (i < j) j <- j - 1 # account for deleted node length(get.all.shortest.paths(net,r,V(net)[j],mode="out")) == 0 } is.ancestor <- function (net,i,j) { length(get.all.shortest.paths(net,V(net)[i],V(net)[j],mode="out")) != 0 } sub CSA { my \$net = shift; my \$v = shift; my \$w = shift; my \$dag = \$net->graph; my @csa = (); foreach my \$u ( \$net->nodes ) { if ( \$dag->is_reachable(\$u,\$v) and \$dag->is_reachable(\$u,\$w) ) { if (is_strict_ancestor(\$net,\$u,\$v) or is_strict_ancestor(\$net,\$u,\$w)) { push @csa, \$u; } } } return \@csa; } CSA <- function (net,i,j) { strict <- lapply(V(net),function (u) is.ancestor(net,u,i) && is.ancestor(net,u,j) && (is.strict.ancestor(net,u,i) || is.strict.ancestor(net,u,j))) V(net)[unlist(strict)] } sub LCSA { my \$net = shift; my \$v = shift; my \$w = shift; my %height = \$net->heights; my @CSA = @{ CSA(\$net,\$v,\$w) }; my \$lcsa = shift @CSA; for my \$node ( @CSA ) { if ( \$height{\$node} < \$height{\$lcsa} ) { \$lcsa = \$node; } } return \$lcsa; } LCSA <- function (net,i,j) { csa <- match(CSA(net,i,j),V(net)) V(net)[csa[which.min(network.height(net)[csa])]-1] }
##### Section 9.1.2: The Path Multiplicity Distance between Graphs
 path.multiplicity <- function (net) { leaves <- V(net)[which(degree(net,mode="out")==0)-1] mu <- matrix(0,nrow=vcount(net),ncol=length(leaves), dimnames=list(sort(V(net)\$name),sort(leaves\$name))) for (v in sort(V(net)[leaves]\$name)) mu[v,v] <- 1 V(net)\$visited <- FALSE Q <- as.vector(V(net)[which(degree(net,mode="out")==0)-1]) while (length(Q)>0) { j <- Q[1] Q <- Q[-1] V(net)[j]\$visited <- TRUE I <- as.vector(V(net)[nei(j,mode="in")]) for (i in I) { mu[V(net)[i]\$name,] <- mu[V(net)[i]\$name,] + mu[V(net)[j]\$name,] if (all(V(net)[nei(i,mode="out")]\$visited)) Q <- c(Q,i) } } mu } lex.cmp <- function(vec1,vec2) { index <- which.min(vec1 == vec2) as.numeric(sign(vec1[index] - vec2[index])) } path.multiplicity.distance <- function (net1,net2) { mu1 <- path.multiplicity(net1) mu1 <- mu1[do.call("order",lapply(1:ncol(mu1),function (j) mu1[,j])),] mu2 <- path.multiplicity(net2) mu2 <- mu2[do.call("order",lapply(1:ncol(mu2),function (j) mu2[,j])),] i1 <- 1 i2 <- 1 c <- 0 while (i1 <= nrow(mu1) && i2 <= nrow(mu2)) { lex <- lex.cmp(mu1[i1,],mu2[i2,]) if ( lex == -1 ) { i1 <- i1 + 1 } else if ( lex == 1 ) { i2 <- i2 + 1 } else { i1 <- i1 + 1 i2 <- i2 + 1 c <- c + 1 } } nrow(mu1)+nrow(mu2)-2*c }
##### Section 9.1.3: The Tripartition Distance between Graphs
 tripartitions <- function (net) { leaves <- V(net)[which(degree(net,mode="out")==0)-1] theta <- matrix("",nrow=vcount(net),ncol=length(leaves), dimnames=list(sort(V(net)\$name),sort(leaves\$name))) for (i in V(net)) { for (j in leaves) { if ( is.ancestor(net,i,j) ) { if ( is.strict.ancestor(net,i,j) ) { theta[V(net)[i]\$name,V(net)[j]\$name] <- "A" } else { theta[V(net)[i]\$name,V(net)[j]\$name] <- "B" } } else { theta[V(net)[i]\$name,V(net)[j]\$name] <- "C" } } } theta } use Bio::PhyloNetwork; sub tripartition_distance { my \$net1 = shift; my \$net2 = shift; \$net1->compute_tripartitions() unless defined \$net1->{tripartitions}; \$net2->compute_tripartitions() unless defined \$net2->{tripartitions}; my @tri1 = sort map {\$net1->{tripartitions}->{\$_}} \$net1->nodes; my @tri2 = sort map {\$net2->{tripartitions}->{\$_}} \$net2->nodes; my \$i1 = 0; my \$i2 = 0; my \$c = 0; while (\$i1 < scalar @tri1 && \$i2 < scalar @tri2) { if (\$tri1[\$i1] lt \$tri2[\$i2]) { \$i1++; } elsif (\$tri1[\$i1] gt \$tri2[\$i2]) { \$i2++; } else { \$i1++; \$i2++; \$c++; } } return scalar @tri1+scalar @tri2-2*\$c; } library(igraph) lex.cmp.char <- function (vec1,vec2) { for (j in 1:length(vec1)) { if (vec1[j] < vec2[j]) { return(-1) } if (vec1[j] > vec2[j]) { return(1) } } return(0) } tripartition.distance <- function (net1,net2) { theta1 <- tripartitions(net1) theta1 <- theta1[do.call("order",lapply(1:ncol(theta1), function (j) theta1[,j])),] theta2 <- tripartitions(net2) theta2 <- theta2[do.call("order",lapply(1:ncol(theta2), function (j) theta2[,j])),] i1 <- 1 i2 <- 1 c <- 0 while (i1 <= nrow(theta1) && i2 <= nrow(theta2)) { lex <- lex.cmp.char(theta1[i1,],theta2[i2,]) if ( lex == -1 ) { i1 <- i1 + 1 } else if ( lex == 1 ) { i2 <- i2 + 1 } else { i1 <- i1 + 1 i2 <- i2 + 1 c <- c + 1 } } nrow(theta1)+nrow(theta2)-2*c }
##### Section 9.1.4: The Nodal Distance between Graphs
 sub nodal_distance { my \$net1 = shift; my \$net2 = shift; my @L = \$net1->leaves; my \$dist = 0; for (my \$i = 0; \$i < @L; \$i++) { for (my \$j = \$i+1; \$j < @L; \$j++) { my \$lcsa1 = LCSA(\$net1,\$L[\$i],\$L[\$j]); my \$lcsa2 = LCSA(\$net2,\$L[\$i],\$L[\$j]); my @p1 = \$net1->graph->SP_Dijkstra(\$lcsa1,\$L[\$i]); my @p2 = \$net2->graph->SP_Dijkstra(\$lcsa2,\$L[\$i]); my \$d1 = @p1 - 1; my \$d2 = @p2 - 1; \$dist += abs(\$d1 - \$d2); @p1 = \$net1->graph->SP_Dijkstra(\$lcsa1,\$L[\$j]); @p2 = \$net2->graph->SP_Dijkstra(\$lcsa2,\$L[\$j]); \$d1 = @p1 - 1; \$d2 = @p2 - 1; \$dist += abs(\$d1 - \$d2); } } return \$dist; } library(igraph) nodal.distance <- function (net1,net2) { l1 <- V(net1)[which(degree(net1,mode="out")==0)-1] n1 <- matrix(0,nrow=length(l1),ncol=length(l1), dimnames=list(sort(l1\$name),sort(l1\$name))) l2 <- V(net2)[which(degree(net2,mode="out")==0)-1] n2 <- matrix(0,nrow=length(l2),ncol=length(l2), dimnames=list(sort(l2\$name),sort(l2\$name))) for (i in l1) { for (j in c(l1)[match(i,l1):length(l1)]) { lcsa <- LCSA(net1,i,j) paths <- get.shortest.paths(net1,lcsa,i,mode="out") n1[V(net1)[i]\$name,V(net1)[j]\$name] <- length(paths[[1]])-1 paths <- get.shortest.paths(net1,lcsa,j,mode="out") n1[V(net1)[j]\$name,V(net1)[i]\$name] <- length(paths[[1]])-1 } } for (i in l2) { for (j in c(l2)[match(i,l2):length(l2)]) { lcsa <- LCSA(net2,i,j) paths <- get.shortest.paths(net2,lcsa,i,mode="out") n2[V(net2)[i]\$name,V(net2)[j]\$name] <- length(paths[[1]])-1 paths <- get.shortest.paths(net2,lcsa,j,mode="out") n2[V(net2)[j]\$name,V(net2)[i]\$name] <- length(paths[[1]])-1 } } sum(abs(n1-n2)) }

#### Section 9.2: Finding Trees in Graphs

 sub explode_rec { my (\$self,\$trees) = @_; my @h = \$self->hybrid_nodes; if (scalar @h) { my \$v = shift @h; for my \$u (\$self->{graph}->predecessors(\$v)) { \$self->{graph}->delete_edge(\$u,\$v); \$self->explode_rec(\$trees); \$self->{graph}->add_edge(\$u,\$v); } } else { my \$io = IO::String->new(\$self->eNewick); my \$treeio = Bio::TreeIO->new(-format => 'newick', -fh => \$io); my \$tree = \$treeio->next_tree; \$tree->contract_linear_paths; push @{\$trees}, \$tree; } } sub explode { my (\$self) = @_; my @trees; \$self->explode_rec(\@trees); return @trees; } network.to.newick <- function (net) { r <- V(net)[which(degree(net,mode="in")==0)-1] newick <- obtain.newick(net,r,c()) paste(c(newick,";"),collapse="") } obtain.newick <- function (net,v,newick) { if (length(V(net)[nei(v,"out")]) != 0) { if ( length(V(net)[nei(v,"in")]) != 0 && match(v,V(net)[nei(V(net)[nei(v,"in")],"out")]) != 1 ) newick <- c(newick,",") newick <- c(newick,"(") } for (w in V(net)[nei(v,"out")]) { newick <- obtain.newick(net,w,newick) } if (length(V(net)[nei(v,"out")]) != 0) newick <- c(newick,")") if ( length(V(net)[nei(v,"in")]) != 0 && length(V(net)[nei(v,"out")]) == 0 && match(v,V(net)[nei(V(net)[nei(v,"in")],"out")]) != 1 ) newick <- c(newick,",") c(newick,V(net)[v]\$name) } network.to.tree <- function (net) { V(net)\$del <- FALSE E(net)\$del <- FALSE r <- V(net)[which(degree(net,mode="in")==0)-1] net <- postorder.traversal(net,r) net <- delete.vertices(net,V(net)[del]) net <- delete.edges(net,E(net)[del]) network.to.newick(net) } postorder.traversal <- function (net,v) { for (w in V(net)[nei(v,"out")]) { net <- postorder.traversal(net,w) } VW <- E(net)[from == v & del==FALSE] if (length(c(VW)) == 1) { u <- V(net)[nei(v,"in")] w <- get.edge(net,VW)[2] E(net,path=c(V(net)[u],V(net)[v]))\$del <- TRUE E(net,path=c(V(net)[v],V(net)[w]))\$del <- TRUE V(net)[v]\$del <- TRUE net <- add.edges(net,c(V(net)[u],V(net)[w])) E(net,path=c(V(net)[u],V(net)[w]))\$del <- FALSE } net } explode <- function (net) { trees <- c() H <- V(net)[which(degree(net,mode="in")>1)-1] if (length(H) == 0) { trees <- c(trees,network.to.tree(net)) } else { v <- c(H)[1] for (u in V(net)[nei(v,"in")]) { new <- delete.edges(net,E(net,P=c(u,v))) trees <- c(trees,explode(new)) } } trees }
##### Section 9.2.1: The Statistical Error between Graphs
 sub error_ratio { my \$net1 = shift; my \$net2 = shift; my @trees1 = \$net1->explode; my @trees2 = \$net2->explode; my \$FN = 0; my \$FP = 0; for my \$tree1 (@trees1) { my \$found = 0; # false for my \$tree2 (@trees2) { if (partition_distance(\$tree1,\$tree2) == 0) { \$found = 1; # true } } if (!\$found) { \$FN++; } } for my \$tree2 (@trees2) { my \$found = 0; # false for my \$tree1 (@trees1) { if (partition_distance(\$tree1,\$tree2) == 0) { \$found = 1; # true } } if (!\$found) { \$FP++; } } return ((\$FN/@trees1)+(\$FP/@trees2))/2; } error.rate <- function (net1,net2) { trees1 <- explode(net1) trees2 <- explode(net2) FN <- 0 FP <- 0 for (t1 in trees1) { tree1 <- read.tree(text=t1) found <- FALSE for (t2 in trees2) { tree2 <- read.tree(text=t2) if (dist.topo(tree1,tree2) == 0) { found <- TRUE break } } if (!found) FN <- FN+1 } for (t2 in trees2) { tree2 <- read.tree(text=t2) found <- FALSE for (t1 in trees1) { tree1 <- read.tree(text=t1) if (dist.topo(tree1,tree2) == 0) { found <- TRUE break } } if (!found) FP <- FP+1 } (FN/length(trees1)+FP/length(trees2))/2 }

### Chapter 10: General Pattern Matching in Graphs

#### Section 10.1: Finding Subgraphs

##### Section 10.1.1: Finding Subgraphs Induced by Triplets
 sub triplet { my (\$net,@triple) = @_; my (\$i,\$j,\$k) = @triple; my @L = \$net->leaves; my \$ij = LCSA(\$net,\$L[\$i],\$L[\$j]); my \$ik = LCSA(\$net,\$L[\$i],\$L[\$k]); my \$jk = LCSA(\$net,\$L[\$j],\$L[\$k]); my \$t; if (\$net->graph->SP_Dijkstra(\$ij,\$ik)) { if (\$net->graph->SP_Dijkstra(\$ik,\$ij)) { \$t = "((\$L[\$j],\$L[\$k]),\$L[\$i]);"; } else { if (\$net->graph->SP_Dijkstra(\$jk,\$ij)) { \$t = "((\$L[\$i],\$L[\$k]),\$L[\$j]);"; } else { \$t = "((\$L[\$i],(\$L[\$k])#H1),(#H1,\$L[\$j]));"; } } } else { if (\$net->graph->SP_Dijkstra(\$ik,\$ij)) { if (\$net->graph->SP_Dijkstra(\$jk,\$ij)) { \$t = "((\$L[\$i],\$L[\$j]),\$L[\$k]);"; } else { \$t = "((\$L[\$i],(\$L[\$j])#H1),(#H1,\$L[\$k]));"; } } else { \$t = "((\$L[\$j],(\$L[\$i])#H1),(#H1,\$L[\$k]));"; } } return \$t; } triplet <- function (net,i,j,k) { ij <- LCSA(net,i,j) ik <- LCSA(net,i,k) jk <- LCSA(net,j,k) ij.ancestor.of.ik <- length(get.all.shortest.paths(net, ij,ik,mode="out")) != 0 ik.ancestor.of.ij <- length(get.all.shortest.paths(net, ik,ij,mode="out")) != 0 ij.ancestor.of.jk <- length(get.all.shortest.paths(net, ij,jk,mode="out")) != 0 jk.ancestor.of.ij <- length(get.all.shortest.paths(net, jk,ij,mode="out")) != 0 if (ij.ancestor.of.ik) if (ik.ancestor.of.ij) paste("((",V(net)[j]\$name,",",V(net)[k]\$name,"),", V(net)[i]\$name,");",sep="") else if (jk.ancestor.of.ij) paste("((",V(net)[i]\$name,",",V(net)[k]\$name,"),", V(net)[j]\$name,");",sep="") else paste("((",V(net)[i]\$name,",(",V(net)[k]\$name,"))#H,(#H,", V(net)[j]\$name,"));",sep="") else if (ik.ancestor.of.ij) if (jk.ancestor.of.ij) paste("((",V(net)[i]\$name,",",V(net)[j]\$name,"),", V(net)[k]\$name,");",sep="") else paste("((",V(net)[i]\$name,",(",V(net)[j]\$name,"))#H,(#H,", V(net)[k]\$name,"));",sep="") else paste("((",V(net)[j]\$name,",(",V(net)[i]\$name,"))#H,(#H,", V(net)[k]\$name,"));",sep="") } sub triplet { my (\$net,@triple) = @_; my (\$i,\$j,\$k) = @triple; my @L = \$net->leaves; my \$ij = LCSA(\$net,\$L[\$i],\$L[\$j]); my \$ik = LCSA(\$net,\$L[\$i],\$L[\$k]); my \$jk = LCSA(\$net,\$L[\$j],\$L[\$k]); my @R; \$R[0] = 1; \$R[1] = \$net->graph->SP_Dijkstra(\$ij,\$ik) ? 1 : 0; \$R[2] = \$net->graph->SP_Dijkstra(\$ij,\$jk) ? 1 : 0; \$R[3] = \$net->graph->SP_Dijkstra(\$ik,\$ij) ? 1 : 0; \$R[4] = 1; \$R[5] = \$net->graph->SP_Dijkstra(\$ik,\$jk) ? 1 : 0; \$R[6] = \$net->graph->SP_Dijkstra(\$jk,\$ij) ? 1 : 0; \$R[7] = \$net->graph->SP_Dijkstra(\$jk,\$ik) ? 1 : 0; \$R[8] = 1; return join "", @R; } triplet <- function (net,i,j,k) { ij <- LCSA(net,i,j) ik <- LCSA(net,i,k) jk <- LCSA(net,j,k) L <- c(V(net)[i]\$name,V(net)[j]\$name,V(net)[k]\$name) R <- matrix(rep(NA,9),nrow=3,dimnames=list(L,L)) R[1,1] <- TRUE R[1,2] <- length(get.all.shortest.paths(net,ij,ik,mode="out")) != 0 R[1,3] <- length(get.all.shortest.paths(net,ij,jk,mode="out")) != 0 R[2,1] <- length(get.all.shortest.paths(net,ik,ij,mode="out")) != 0 R[2,2] <- TRUE R[2,3] <- length(get.all.shortest.paths(net,ik,jk,mode="out")) != 0 R[3,1] <- length(get.all.shortest.paths(net,jk,ij,mode="out")) != 0 R[3,2] <- length(get.all.shortest.paths(net,jk,ik,mode="out")) != 0 R[3,3] <- TRUE R }

#### Section 10.2: Finding Common Subgraphs

##### Section 10.2.1: Maximum Agreement of Rooted Networks
 sub bottom_up_subgraph { my \$graph = shift; my \$node = shift; my \$sub = Graph->new; my @Q = (\$node); while (@Q) { \$node = shift @Q; \$sub->add_vertex(\$node); push @Q, \$graph->successors(\$node); } for my \$edge (\$graph->edges) { if (\$sub->has_vertex(@\$edge[0]) && \$sub->has_vertex(@\$edge[1])) { \$sub->add_edge(@\$edge[0],@\$edge[1]); } } while (my @V = grep { \$sub->in_degree(\$_)==1 && \$sub->out_degree(\$_)==1 } \$sub->interior_vertices) { my \$v = shift @V; my @U = \$sub->predecessors(\$v); my \$u = shift @U; my @W = \$sub->successors(\$v); my \$w = shift @W; \$sub->add_edge(\$u,\$w); \$sub->delete_vertex(\$v); } return \$sub; } bottom.up.subgraph <- function (net,i) { Q <- c(i) S <- c() while (length(Q)>0) { i <- Q[1] Q <- Q[-1] S <- c(S,i) Q <- c(Q,as.vector(V(net)[nei(i,mode="out")])) } g <- subgraph(net,unique(S)) while (length(V(g)[which(degree(g,mode="in")==1 & degree(g,mode="out")==1)-1])>0) { D <- V(g)[which(degree(g,mode="in")==1 & degree(g,mode="out")==1)-1] v <- head(c(D),n=1) # elementary node u <- V(g)[nei(v,"in")] w <- V(g)[nei(v,"out")] g <- add.edges(g,c(V(g)[u],V(g)[w])) g <- delete.vertices(g,V(g)[v]) } g } sub graph_isomorphic { my \$g1 = shift; my \$g2 = shift; my \$found = 0; if (\$g1->could_be_isomorphic(\$g2)) { my \$C1 = cluster_map(\$g1); my \$C2 = cluster_map(\$g2); my %X; for my \$node1 (\$g1->vertices) { for my \$node2 (\$g2->vertices) { my \$lc = List::Compare->new(@{\$C1}{\$node1}, @{\$C2}{\$node2}); if (\$lc->is_LequivalentR) { push @{\$X{\$node1}}, \$node2; } } } my \$node1 = undef; my %M; my @V1 = sort \$g1->vertices; backtrack(\$g1,\$g2,\%X,\$node1,\%M,\@V1,\\$found); } return \$found; } sub backtrack { my (\$g1,\$g2,\$X,\$node1,\$M,\$V1,\$found) = @_; my %X = %{\$X}; my %M = %{\$M}; my @V1 = @{\$V1}; unless (\$\$found) { for my \$prev1 (keys %M) { if (\$g1->has_edge(\$prev1,\$node1) && !\$g2->has_edge(\$M{\$prev1},\$M{\$node1}) || \$g1->has_edge(\$node1,\$prev1) && !\$g2->has_edge(\$M{\$node1},\$M{\$prev1}) || !\$g1->has_edge(\$prev1,\$node1) && \$g2->has_edge(\$M{\$prev1},\$M{\$node1}) || !\$g1->has_edge(\$node1,\$prev1) && \$g2->has_edge(\$M{\$node1},\$M{\$prev1})) { return; } } if (scalar @V1) { \$node1 = shift @V1; for my \$node2 (@{\$X{\$node1}}) { \$M{\$node1} = \$node2; backtrack(\$g1,\$g2,\$X,\$node1,\%M,\@V1,\$found); \$M{\$node1} = undef; } } else { \$\$found = 1; } } } sub cluster_map { my \$graph = shift; my %C; for my \$node (\$graph->vertices) { for my \$leaf (\$graph->successorless_vertices) { if (\$graph->is_reachable(\$node,\$leaf)) { push @{\$C{\$node}}, \$leaf; } } } return \%C; } sub max_agreement_subgraph { my \$net1 = shift; my \$net2 = shift; my \$graph1 = \$net1->graph; my \$graph2 = \$net2->graph; my \$common = Graph->new; my \$size = 0; for my \$node1 (\$graph1->interior_vertices) { my \$sub1 = subgraph(\$graph1,\$node1); for my \$node2 (\$graph2->interior_vertices) { my \$sub2 = subgraph(\$graph2,\$node2); if (graph_isomorphic(\$sub1,\$sub2)) { my \$leaves1 = \$sub1->successorless_vertices; if (\$leaves1 > \$size) { \$common = \$sub1; \$size = \$leaves1; } } } } return \$common; } max.agreement.subgraph <- function (net1,net2) { max <- graph.empty() max.size <- 0 for (i in V(net1)) { g1 <- bottom.up.subgraph(net1,i) for (j in V(net2)) { g2 <- bottom.up.subgraph(net2,j) if (graph.isomorphic(g1,g2)) { L <- V(g1)[which(degree(g1,mode="out")==0)-1] if (length(L)>max.size) { max <- g1 max.size <- length(L) } } } } max }

#### Section 10.3: Comparing Graphs

##### Section 10.3.1: The Triplets Distance between Graphs
 sub triplets_distance { my \$net1 = shift; my \$net2 = shift; my \$comp = Array::Compare->new; my @L = \$net1->leaves; my \$dist = 0; for (my \$i = 0; \$i < @L; \$i++) { for (my \$j = \$i+1; \$j < @L; \$j++) { for (my \$k = \$j+1; \$k < @L; \$k++) { my \$t1 = triplet(\$net1,\$i,\$j,\$k); my \$t2 = triplet(\$net2,\$i,\$j,\$k); \$dist += 2 unless (\$t1 eq \$t2); } } } return \$dist; } triplets.distance <- function (net1,net2) { L <- sort(V(net1)[which(degree(net1,mode="out")==0)-1]\$name) d <- 0 for (i in L[1:(length(L)-2)]) { i1 <- V(net1)[V(net1)\$name == i] i2 <- V(net2)[V(net2)\$name == i] for (j in L[(match(i,L)+1):(length(L)-1)]) { j1 <- V(net1)[V(net1)\$name == j] j2 <- V(net2)[V(net2)\$name == j] for (k in L[(match(j,L)+1):length(L)]) { k1 <- V(net1)[V(net1)\$name == k] k2 <- V(net2)[V(net2)\$name == k] t1 <- triplet(net1,i1,j1,k1) t2 <- triplet(net2,i2,j2,k2) if (!isTRUE(all.equal(t1,t2))) { d <- d + 2 } } } } d }

### Appendix A: Elements of Perl

#### Section A.1: Perl Scripts

 sub rna_to_protein { my \$rna = shift; my \$protein; my \$i = 0; while (\$i < length(\$rna) - 2 && substr(\$rna,\$i,3) ne "AUG") { # start codon \$i++; } \$i += 3; # skip the start codon while (\$i < length(\$rna) - 2 && substr(\$rna,\$i,3) ne "UAA" && substr(\$rna,\$i,3) ne "UAG" && substr(\$rna,\$i,3) ne "UGA") { my \$codon = substr(\$rna,\$i,3); \$protein .= codon_to_amino_acid(\$codon); \$i += 3; } return \$protein; } sub codon_to_amino_acid { my \$codon = shift; if (\$codon eq "UUU") { return "F" } # Phe elsif (\$codon eq "UUC") { return "F" } # Phe elsif (\$codon eq "UUA") { return "L" } # Leu elsif (\$codon eq "UUG") { return "L" } # Leu elsif (\$codon eq "UCU") { return "S" } # Ser elsif (\$codon eq "UCC") { return "S" } # Ser elsif (\$codon eq "UCA") { return "S" } # Ser elsif (\$codon eq "UCG") { return "S" } # Ser elsif (\$codon eq "UAU") { return "Y" } # Tyr elsif (\$codon eq "UAC") { return "Y" } # Tyr elsif (\$codon eq "UAA") { return "-" } # stop elsif (\$codon eq "UAG") { return "-" } # stop elsif (\$codon eq "UGU") { return "C" } # Cys elsif (\$codon eq "UGC") { return "C" } # Cys elsif (\$codon eq "UGA") { return "-" } # stop elsif (\$codon eq "UGG") { return "W" } # Trp elsif (\$codon eq "CUU") { return "L" } # Leu elsif (\$codon eq "CUC") { return "L" } # Leu elsif (\$codon eq "CUA") { return "L" } # Leu elsif (\$codon eq "CUG") { return "L" } # Leu elsif (\$codon eq "CCU") { return "P" } # Pro elsif (\$codon eq "CCC") { return "P" } # Pro elsif (\$codon eq "CCA") { return "P" } # Pro elsif (\$codon eq "CCG") { return "P" } # Pro elsif (\$codon eq "CAU") { return "H" } # His elsif (\$codon eq "CAC") { return "H" } # His elsif (\$codon eq "CAA") { return "Q" } # Gln elsif (\$codon eq "CAG") { return "Q" } # Gln elsif (\$codon eq "CGU") { return "R" } # Arg elsif (\$codon eq "CGC") { return "R" } # Arg elsif (\$codon eq "CGA") { return "R" } # Arg elsif (\$codon eq "CGG") { return "R" } # Arg elsif (\$codon eq "AUU") { return "I" } # Ile elsif (\$codon eq "AUC") { return "I" } # Ile elsif (\$codon eq "AUA") { return "I" } # Ile elsif (\$codon eq "AUG") { return "M" } # Met elsif (\$codon eq "ACU") { return "T" } # Thr elsif (\$codon eq "ACC") { return "T" } # Thr elsif (\$codon eq "ACA") { return "T" } # Thr elsif (\$codon eq "ACG") { return "T" } # Thr elsif (\$codon eq "AAU") { return "N" } # Asn elsif (\$codon eq "AAC") { return "N" } # Asn elsif (\$codon eq "AAA") { return "K" } # Lys elsif (\$codon eq "AAG") { return "K" } # Lys elsif (\$codon eq "AGU") { return "S" } # Ser elsif (\$codon eq "AGC") { return "S" } # Ser elsif (\$codon eq "AGA") { return "R" } # Arg elsif (\$codon eq "AGG") { return "R" } # Arg elsif (\$codon eq "GUU") { return "V" } # Val elsif (\$codon eq "GUC") { return "V" } # Val elsif (\$codon eq "GUA") { return "V" } # Val elsif (\$codon eq "GUG") { return "V" } # Val elsif (\$codon eq "GCU") { return "A" } # Ala elsif (\$codon eq "GCC") { return "A" } # Ala elsif (\$codon eq "GCA") { return "A" } # Ala elsif (\$codon eq "GCG") { return "A" } # Ala elsif (\$codon eq "GAU") { return "D" } # Asp elsif (\$codon eq "GAC") { return "D" } # Asp elsif (\$codon eq "GAA") { return "E" } # Glu elsif (\$codon eq "GAG") { return "E" } # Glu elsif (\$codon eq "GGU") { return "G" } # Gly elsif (\$codon eq "GGC") { return "G" } # Gly elsif (\$codon eq "GGA") { return "G" } # Gly elsif (\$codon eq "GGG") { return "G" } # Gly else { return "*" } } my %codon2aa = ( "UUU" => "F", # Phe "UUC" => "F", # Phe "UUA" => "L", # Leu "UUG" => "L", # Leu "UCU" => "S", # Ser "UCC" => "S", # Ser "UCA" => "S", # Ser "UCG" => "S", # Ser "UAU" => "Y", # Tyr "UAC" => "Y", # Tyr "UAA" => "-", # stop "UAG" => "-", # stop "UGU" => "C", # Cys "UGC" => "C", # Cys "UGA" => "-", # stop "UGG" => "W", # Trp "CUU" => "L", # Leu "CUC" => "L", # Leu "CUA" => "L", # Leu "CUG" => "L", # Leu "CCU" => "P", # Pro "CCC" => "P", # Pro "CCA" => "P", # Pro "CCG" => "P", # Pro "CAU" => "H", # His "CAC" => "H", # His "CAA" => "Q", # Gln "CAG" => "Q", # Gln "CGU" => "R", # Arg "CGC" => "R", # Arg "CGA" => "R", # Arg "CGG" => "R", # Arg "AUU" => "I", # Ile "AUC" => "I", # Ile "AUA" => "I", # Ile "AUG" => "M", # Met "ACU" => "T", # Thr "ACC" => "T", # Thr "ACA" => "T", # Thr "ACG" => "T", # Thr "AAU" => "N", # Asn "AAC" => "N", # Asn "AAA" => "K", # Lys "AAG" => "K", # Lys "AGU" => "S", # Ser "AGC" => "S", # Ser "AGA" => "R", # Arg "AGG" => "R", # Arg "GUU" => "V", # Val "GUC" => "V", # Val "GUA" => "V", # Val "GUG" => "V", # Val "GCU" => "A", # Ala "GCC" => "A", # Ala "GCA" => "A", # Ala "GCG" => "A", # Ala "GAU" => "D", # Asp "GAC" => "D", # Asp "GAA" => "E", # Glu "GAG" => "E", # Glu "GGU" => "G", # Gly "GGC" => "G", # Gly "GGA" => "G", # Gly "GGG" => "G" # Gly ); sub rna_to_protein { my \$rna = shift; my \$protein; my \$aa; my \$i = 0; while (\$i < length(\$rna) - 2 && substr(\$rna,\$i,3) ne "AUG") { # start codon \$i++; } \$i += 3; # skip the start codon while (\$i < length(\$rna) - 2 && substr(\$rna,\$i,3) ne "UAA" && substr(\$rna,\$i,3) ne "UAG" && substr(\$rna,\$i,3) ne "UGA") { my \$codon = substr(\$rna,\$i,3); if (defined \$codon2aa{\$codon}) { \$aa = \$codon2aa{\$codon}; } else { \$aa = "*"; } \$protein .= \$aa; \$i += 3; } return \$protein; }

#### Section A.2: Overview of Perl

 package DNA; sub reverse_complement { my \$seq = shift; \$seq = reverse \$seq; \$seq =~ tr/ACGT/TGCA/; return \$seq; } package RNA; sub reverse_complement { my \$seq = shift; \$seq = reverse \$seq; \$seq =~ tr/ACGU/UGCA/; return \$seq; } 1;

### Appendix B: Elements of R

#### Section B.1: R Scripts

 rna.to.protein <- function (rna) { protein <- "" i <- 1 while (i < nchar(rna)-1 && substr(rna,i,i+2) != "AUG") { # start codon i <- i+1 } i <- i+3 # skip the start codon while (i < nchar(rna)-1 && substr(rna,i,i+2) != "UAA" && substr(rna,i,i+2) != "UAG" && substr(rna,i,i+2) != "UGA") { codon <- substr(rna,i,i+2) protein <- paste(protein, codon.to.amino.acid(codon),sep="") i <- i+3 } return(protein) } codon.to.amino.acid <- function (codon) { if (codon == "UUU") return("F") # Phe else if (codon == "UUC") return("F") # Phe else if (codon == "UUA") return("L") # Leu else if (codon == "UUG") return("L") # Leu else if (codon == "UCU") return("S") # Ser else if (codon == "UCC") return("S") # Ser else if (codon == "UCA") return("S") # Ser else if (codon == "UCG") return("S") # Ser else if (codon == "UAU") return("Y") # Tyr else if (codon == "UAC") return("Y") # Tyr else if (codon == "UAA") return("-") # stop else if (codon == "UAG") return("-") # stop else if (codon == "UGU") return("C") # Cys else if (codon == "UGC") return("C") # Cys else if (codon == "UGA") return("-") # stop else if (codon == "UGG") return("W") # Trp else if (codon == "CUU") return("L") # Leu else if (codon == "CUC") return("L") # Leu else if (codon == "CUA") return("L") # Leu else if (codon == "CUG") return("L") # Leu else if (codon == "CCU") return("P") # Pro else if (codon == "CCC") return("P") # Pro else if (codon == "CCA") return("P") # Pro else if (codon == "CCG") return("P") # Pro else if (codon == "CAU") return("H") # His else if (codon == "CAC") return("H") # His else if (codon == "CAA") return("Q") # Gln else if (codon == "CAG") return("Q") # Gln else if (codon == "CGU") return("R") # Arg else if (codon == "CGC") return("R") # Arg else if (codon == "CGA") return("R") # Arg else if (codon == "CGG") return("R") # Arg else if (codon == "AUU") return("I") # Ile else if (codon == "AUC") return("I") # Ile else if (codon == "AUA") return("I") # Ile else if (codon == "AUG") return("M") # Met else if (codon == "ACU") return("T") # Thr else if (codon == "ACC") return("T") # Thr else if (codon == "ACA") return("T") # Thr else if (codon == "ACG") return("T") # Thr else if (codon == "AAU") return("N") # Asn else if (codon == "AAC") return("N") # Asn else if (codon == "AAA") return("K") # Lys else if (codon == "AAG") return("K") # Lys else if (codon == "AGU") return("S") # Ser else if (codon == "AGC") return("S") # Ser else if (codon == "AGA") return("R") # Arg else if (codon == "AGG") return("R") # Arg else if (codon == "GUU") return("V") # Val else if (codon == "GUC") return("V") # Val else if (codon == "GUA") return("V") # Val else if (codon == "GUG") return("V") # Val else if (codon == "GCU") return("A") # Ala else if (codon == "GCC") return("A") # Ala else if (codon == "GCA") return("A") # Ala else if (codon == "GCG") return("A") # Ala else if (codon == "GAU") return("D") # Asp else if (codon == "GAC") return("D") # Asp else if (codon == "GAA") return("E") # Glu else if (codon == "GAG") return("E") # Glu else if (codon == "GGU") return("G") # Gly else if (codon == "GGC") return("G") # Gly else if (codon == "GGA") return("G") # Gly else if (codon == "GGG") return("G") # Gly else return("*") } codon2aa <- c( "UUU" = "F", # Phe "UUC" = "F", # Phe "UUA" = "L", # Leu "UUG" = "L", # Leu "UCU" = "S", # Ser "UCC" = "S", # Ser "UCA" = "S", # Ser "UCG" = "S", # Ser "UAU" = "Y", # Tyr "UAC" = "Y", # Tyr "UAA" = "-", # stop "UAG" = "-", # stop "UGU" = "C", # Cys "UGC" = "C", # Cys "UGA" = "-", # stop "UGG" = "W", # Trp "CUU" = "L", # Leu "CUC" = "L", # Leu "CUA" = "L", # Leu "CUG" = "L", # Leu "CCU" = "P", # Pro "CCC" = "P", # Pro "CCA" = "P", # Pro "CCG" = "P", # Pro "CAU" = "H", # His "CAC" = "H", # His "CAA" = "Q", # Gln "CAG" = "Q", # Gln "CGU" = "R", # Arg "CGC" = "R", # Arg "CGA" = "R", # Arg "CGG" = "R", # Arg "AUU" = "I", # Ile "AUC" = "I", # Ile "AUA" = "I", # Ile "AUG" = "M", # Met "ACU" = "T", # Thr "ACC" = "T", # Thr "ACA" = "T", # Thr "ACG" = "T", # Thr "AAU" = "N", # Asn "AAC" = "N", # Asn "AAA" = "K", # Lys "AAG" = "K", # Lys "AGU" = "S", # Ser "AGC" = "S", # Ser "AGA" = "R", # Arg "AGG" = "R", # Arg "GUU" = "V", # Val "GUC" = "V", # Val "GUA" = "V", # Val "GUG" = "V", # Val "GCU" = "A", # Ala "GCC" = "A", # Ala "GCA" = "A", # Ala "GCG" = "A", # Ala "GAU" = "D", # Asp "GAC" = "D", # Asp "GAA" = "E", # Glu "GAG" = "E", # Glu "GGU" = "G", # Gly "GGC" = "G", # Gly "GGA" = "G", # Gly "GGG" = "G" # Gly ) rna.to.protein <- function (rna) { protein <- "" i <- 1 while (i < nchar(rna)-1 && substr(rna,i,i+2) != "AUG") { # start codon i <- i+1 } i <- i+3 # skip the start codon while (i < nchar(rna)-1 && substr(rna,i,i+2) != "UAA" && substr(rna,i,i+2) != "UAG" && substr(rna,i,i+2) != "UGA") { codon <- substr(rna,i,i+2) if (is.na(codon2aa[codon])) { aa <- "*" } else { aa <- codon2aa[codon] } protein <- paste(protein,aa,sep="") i <- i+3 } return(protein) }

#### Section B.2: Overview of R

 reverse.complement <- function (seq) UseMethod("reverse.complement") reverse.complement.dna <- function (seq) { rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="") chartr("ACGT","TGCA",rev) } reverse.complement.rna <- function (seq) { rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="") chartr("ACGU","UGCA",rev) }

#### Section B.3: R Quick Reference Card

Gabriel Valiente valiente@ (lsi.upc.edu)