Gabriel Valiente

Combinatorial Pattern Matching Algorithms in Computational Biology using Perl and R

Here is the source code for Combinatorial Pattern Matching Algorithms in Computational Biology using Perl and R, by Gabriel Valiente. It has been tested using Perl version 5.8.6 (BioPerl version 1.5) and R version 2.7.1 (seqinr version 1.1-7 and APE version 2.1-3).

Chapter 1: Introduction

Section 1.1: Combinatorial Pattern Matching

Section 1.2: Computational Biology

Section 1.3: A Motivating Example: Gene Prediction

sub extract_open_reading_frames {
  my $seq = shift;
  for my $r (0,1,2) {
    for (my $i = $r; $i <= length($seq)-3; $i += 3) {
      if (substr($seq,$i,3) eq "ATG") {
        my $j = $i+3;
        while ($j <= length($seq)-3 &&
            substr($seq,$j,3) ne "TAA" &&
            substr($seq,$j,3) ne "TAG" &&
            substr($seq,$j,3) ne "TGA") {
          $j += 3;
        }
        if ($j <= length($seq)-3) {
          my $len = $j+2-$i+1;
          if ($len >= 30) {
            print substr($seq,$i,$j+2-$i+1),"\n";
          }
        }
      }
    }
  }
}

extract.open.reading.frames <- function (seq) {
  for (i in 1:3) {
    while (i+2 <= nchar(seq)) {
      if (substr(seq,i,i+2) == "ATG") {
        j <- i + 3
        while (j+2 <= nchar(seq) &&
            substr(seq,j,j+2) != "TAA" &&
            substr(seq,j,j+2) != "TAG" &&
            substr(seq,j,j+2) != "TGA") {
          j <- j + 3
        }
        if (j+2 <= nchar(seq)) {
          if (j+2-i+1 >= 30) {
            print(c(i,j+2,substr(seq,i,j+2)))
          }
        }
      }
      i <- i + 3
    }
  }
}

sub longest_open_reading_frame {
  my $seq = shift;
  my ($ii,$jj) = (0,0);
  for my $r (0,1,2) {
    for (my $i = $r; $i <= length($seq)-3; $i += 3) {
      if (substr($seq,$i,3) eq "ATG") {
        my $j = $i+3;
        while ($j <= length($seq)-3 &&
            substr($seq,$j,3) ne "TAA" &&
            substr($seq,$j,3) ne "TAG" &&
            substr($seq,$j,3) ne "TGA") {
          $j += 3;
        }
        if ($j <= length($seq)-3) {
          my $len = $j+2-$i+1;
          if ($j+2-$i+1 > $jj+2-$ii+1) {
            $ii = $i;
            $jj = $j;
          }
        }
      }
    }
  }
  return [$ii,$jj+2];
}

longest.open.reading.frame <- function (seq) {
  ii <- jj <- 0
  for (i in 1:3) {
    while (i+2 <= nchar(seq)) {
      if (substr(seq,i,i+2) == "ATG") {
        j <- i + 3
        while (j+2 <= nchar(seq) &&
            substr(seq,j,j+2) != "TAA" &&
            substr(seq,j,j+2) != "TAG" &&
            substr(seq,j,j+2) != "TGA") {
          j <- j + 3
        }
        if (j+2 <= nchar(seq)) {
          if (j+2-i+1 > jj+2-ii+1) {
            ii <- i
            jj <- j
          }
        }
      }
      i <- i + 3
    }
  }
  c(ii,jj+2)
}

Chapter 2: Sequences

Section 2.1: Sequences in Mathematics

Section 2.1.1: Counting Labeled Sequences

Section 2.2: Sequences in Computer Science

Section 2.2.1: Traversing Labeled Sequences

sub seq_to_labeled_seq {
  my $seq = shift;
  my %seq;
  for (my $i = 0; $i < length($seq); $i++) {
    my $elem = substr($seq,$i,1);
    $seq{$elem}++;
  }
  return \%seq;
}

seq.to.labeled.seq <- function (seq) {
  alphabet <- sort(unique(strsplit(seq,"")[[1]]))
  lab <- matrix(0,nrow=length(alphabet),ncol=1)
  dimnames(lab) <- list(alphabet,"count")
  for (i in 1:nchar(seq))
    lab[substr(seq,i,i),] <-
      lab[substr(seq,i,i),] + 1
  lab
}

sub symmetric_difference {
  my $seq1 = shift;
  my $seq2 = shift;

  my %seq1 = %{ seq_to_labeled_seq($seq1) };
  my %seq2 = %{ seq_to_labeled_seq($seq2) };

  for my $elem (keys %seq2) {
    $seq1{$elem} = abs($seq1{$elem}-$seq2{$elem});
  }

  return \%seq1;
}

symmetric.difference <- function (seq1,seq2) {
    lab1 <- seq.to.labeled.seq(seq1)
    lab2 <- seq.to.labeled.seq(seq2)
    for (elem in row.names(lab2))
      lab1[elem,] <- abs(lab1[elem,] - lab2[elem,])
    lab1
}

Section 2.3: Sequences in Computational Biology

Section 2.3.1: Reverse Complementing DNA Sequences

sub reverse_complement {
  my $seq = shift;
  $seq = reverse $seq;
  $seq =~ tr/ACGT/TGCA/;
  return $seq;
}

reverse.complement <- function (seq) {
  rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="")
  chartr("ACGT","TGCA",rev)
}

Section 2.3.2: Counting RNA Sequences

sub count {
  my $n = shift;
  my @R;
  $R[0] = $R[1] = $R[2] = 1;
  for (my $i = 2; $i < $n; $i++) {
    $R[$i+1] = $R[$i];
    for (my $j = 1; $j < $i; $j++) {
      $R[$i+1] += $R[$j-1] * $R[$i-$j]
    }
  }
  return $R[$n];
}

count <- function (n) {
  R <- rep(0,n)
  R[1:3] <- 1
  if (n > 2) {
    for (i in 2:(n-1)) {
      R[i+2] <- R[i+1]
      for (j in 1:(i-1)) {
        R[i+2] <- R[i+2] + R[j] * R[i-j+1]
      }
    }
  }
  R[n+1]
}

Section 2.3.3: Generating DNA Sequences

sub words {
  my $n = shift;
  my $alphabet = shift;
  my @n = split "", $alphabet;
  if ($n == 1) {
    return \@n;
  } else {
    my $short = words($n - 1, $alphabet);
    my @long;
    for my $seq (@$short) {
      for my $n (@n) {
        push @long, $seq . $n;
      }
    }
    return \@long;
  }
}

library(seqinr)

words(length=3,alphabet=s2c("ACGT"))

Section 2.3.4: Representing Sequences in Perl

Section 2.3.5: Representing Sequences in R

Chapter 3: Simple Pattern Matching in Sequences

Section 3.1: Finding Words in Sequences

Section 3.1.1: Word Composition of Sequences

sub word_composition {
  my $seq = shift;
  my $k = shift;
  my $alphabet = shift;

  my $words = words($k,$alphabet);

  my %freq;
  for my $word (@$words) {
    $freq{$word} = 0;
  }

  for my $i (0 .. length($seq)-$k) {
    my $word = substr($seq,$i,$k);
    $freq{$word}++;
  }

  return \%freq;
}

library(seqinr)

count(s2c(seq),word=3,alphabet=s2c("ACGT"))

Section 3.1.2: Alignment Free Comparison of Sequences

	library(seqinr) alignment.free.distance <- function (seq1,seq2,k,sigma) { freq1 <- count(seq1,word=k,alphabet=s2c(sigma)) freq2 <- count(seq2,word=k,alphabet=s2c(sigma)) cor(freq1,freq2) }
sub mean { my $a = shift; my $res; foreach (@$a) { $res += $_ } $res /= @$a; }
sub sd { my $a = shift; my $mean = mean($a); return sqrt( mean( [map $_ 2, @$a] ) - ($mean 2) ); }
sub cov { my $a1 = shift; my $a2 = shift; my $res; foreach (0 .. @$a1 - 1) { $res += $a1->[$_] * $a2->[$_]; } $res /= @$a1; $res -= mean($a1) * mean($a2); }
sub cor { my $a1 = shift; my $a2 = shift; return cov($a1,$a2) / (sd($a1) * sd($a2)); }

Chapter 4: General Pattern Matching in Sequences

Section 4.1: Finding Subsequences

sub occurrences {
  my $p = shift;
  my $t = shift;
  my $m = length $p;
  my $n = length $t;
  my @L;
  for (my $i = 0; $i < $n-$m+1; $i++) {
    if ($p eq substr($t,$i,$m)) {
      push @L, $i;
    }
  }
  return \@L;
}

occurrences <- function (p,t) {
  L <- c()
  for (i in 1:nchar(t)-nchar(p)+1)
    if (p == substr(t,i,i+nchar(p)-1))
      L <- c(L,i)
  L
}

Section 4.1.1: Suffix Arrays

sub suffix_array { my $s = shift; my @sa = (0 .. length($s)-1); @sa = sort {substr($s,$a) cmp substr($s,$b)} @sa; return \@sa; }	suffix.array <- function (seq) { sa <- apply(matrix(1:nchar(seq)),1,function(i)substr(seq,i,nchar(seq))) order(sa) }
sub occurrences { my $p = shift; my $t = shift; my $sa = shift; my ($l, $r) = (0, $#sa); while ($l <= $r) { my $i = int(($l + $r)/2); my $c = $p cmp substr($t, $sa[$i], length $p); if ($c < 0) { $r = $i - 1; } elsif ($c > 0) { $l = $i + 1; } else { my $j = $i; while ($i > 0 && $p eq substr($t, $sa[$i-1], length $p)) { $i--; } while ($j < $#sa-1 && $p eq substr($t, $sa[$j+1], length $p)) { $j++; } return ($i, $j); } } return (-1, -1); }	occurrences <- function (p,t,sa) { ell <- 1 r <- nchar(t) while (ell <= r) { i <- as.integer((ell + r) / 2) s <- substr(t,sa[i],sa[i]+nchar(p)-1) if (p < s) { r <- i - 1 } else if (p > s) { ell <- i + 1 } else { j <- i while (i > 1 && p==substr(t,sa[i-1],sa[i-1]+nchar(p)-1)) { i <- i - 1 } while (j < nchar(t) && p==substr(t,sa[j+1],sa[j+1]+nchar(p)-1)) { j <- j + 1 } return(c(i,j)) } } return(c(-1,-1)) }
sub lcp { my ($p, $i, $t, $j) = @_; my $c = 0; while ($i+$c < length $p && $j+$c < length $t && substr($p,$i+$c,1) eq substr($t,$j+$c,1)) { $c++; } return $c; }	lcp <- function (seq1,pos1,seq2,pos2) { n1 <- nchar(seq1) n2 <- nchar(seq2) c <- 0 while (pos1+c <= n1 && pos2+c <= n2 && substr(seq1,pos1+c,pos1+c) == substr(seq2,pos2+c,pos2+c)) { c <- c + 1 } c }
sub occurrences { my $p = shift; my $t = shift; my $sa = shift; my ($l, $r) = (0, $#sa); my ($llcp, $rlcp) = (0, 0); while ($l <= $r) { my $i = int(($l + $r)/2); my $h = $llcp; $h = $rlcp if $rlcp < $h; my $c = lcp($p,$h,$t,$sa[$i]+$h); if ($h+$c == length $p) { my $j = $i; while ($i > 0 && $p eq substr($t, $sa[$i-1], length $p)) { $i--; } while ($j < $#sa-1 && $p eq substr($t, $sa[$j+1], length $p)) { $j++; } return ($i, $j); } elsif (substr($p,$h+$c,1) lt substr($t,$sa[$i]+$h+$c,1)) { $r = $i - 1; $rlcp = $h + $c; } else { $l = $i + 1; $llcp = $h + $c; } } return (-1, -1); }	occurrences <- function (p,t,sa) { ell <- 1 r <- nchar(t) llcp <- 0 rlcp <- 0 while (ell <= r) { i <- as.integer((ell + r) / 2) h <- min(llcp,rlcp) c <- lcp(p,h+1,t,sa[i]+h) if (h+c == nchar(p)) { j <- i while (i > 1 && p==substr(t,sa[i-1],sa[i-1]+nchar(p)-1)) { i <- i - 1 } while (j < nchar(t) && p==substr(t,sa[j+1],sa[j+1]+nchar(p)-1)) { j <- j + 1 } return(c(i,j)) } else if (substr(p,h+c+1,h+c+1) < substr(t,sa[i]+h+c,sa[i]+h+c)) { r <- i - 1 rlcp <- h + c } else { ell <- i + 1 llcp <- h + c } } return(c(-1,-1)) }

Section 4.2: Finding Common Subsequences

sub common_occurrences {
  my $p = shift;
  my $t1 = shift;
  my $t2 = shift;
  my $m = length $p;
  my $n1 = length $t1;
  my $n2 = length $t2;
  my @L;
  for (my $i = 0; $i < $n1-$m+1; $i++) {
    if ($p eq substr($t1,$i,$m)) {
      for (my $j = 0; $j < $n2-$m+1; $j++) {
        if ($p eq substr($t2,$j,$m)) {
          push @L, $i, $j;
        }
      }
    }
  }
  return \@L;
}

common.occurrences <- function (p,t1,t2) {
  L <- c()
  for (i in 1:nchar(t1)-nchar(p)+1)
    if (p == substr(t1,i,i+nchar(p)-1))
      for (j in 1:nchar(t2)-nchar(p)+1)
        if (p == substr(t2,j,j+nchar(p)-1))
          L <- c(L,i,j)
  L
}

sub longest_common_subsequences {
  my $s1 = shift;
  my $s2 = shift;
  my $n1 = length($s1);
  my $n2 = length($s2);
  my @L;
  my $l = 0;
  for (my $i = 0; $i < $n1; $i++) {
    for (my $j = $i; $j < $n1; $j++) {
      my $seq = substr($s1,$i,$j-$i+1);
      for (my $k = 0; $k < $n2-$j+$i; $k++) {
        if ($seq eq substr($s2,$k,$j-$i+1)) {
          if (length($seq) == $l) {
            push @L, $seq;
          } elsif (length($seq) > $l) {
            @L = $seq;
            $l = length($seq);
          }
        }
      }
    }
  }
  my %count;
  for my $seq (@L) { $count{$seq}++; }
  @L = sort keys %count;
  return \@L;
}

longest.common.subsequences <- function (s1,s2) {
  L <- c()
  ell <- 0
  for (i in 1:nchar(s1)) {
    for (j in i:nchar(s1)) {
      seq <- substr(s1,i,j)
      for (k in 1:(nchar(s2)-j+i)) {
        if (seq == substr(s2,k,k+j-i)) {
          if (nchar(seq) == ell) {
            L <- c(L,seq)
          } else if (nchar(seq) > ell) {
            L <- c(seq)
            ell <- nchar(seq)
          }
        }
      }
    }
  }
  unique(L)
}

sub longest_common_subsequences {
  my $s1 = shift;
  my $s2 = shift;
  my @L;
  my $l = 0;
  my @LCS;
  for (my $i = 0; $i < length $s1; $i++ ) {
    for (my $j = 0; $j < length $s2; $j++ ) {
      $LCS[$i][$j] = 0;
      if (substr($s1,$i,1) eq substr($s2,$j,1)) {
        if ($i == 0 || $j == 0) {
          $LCS[$i][$j] = 1;
        } else {
          $LCS[$i][$j] = $LCS[$i-1][$j-1] + 1;
        }
        if ($LCS[$i][$j] > $l) {
          $l = $LCS[$i][$j];
          @L = ();
        }
        if ($LCS[$i][$j] == $l) {
          push @L, substr($s1,$i-$l+1,$l);
        }
      }
    }
  }
  return @L;
}

longest.common.subsequences <- function (s1,s2) {
  L <- c()
  ell <- 0
  LCS <- matrix(0,nrow=nchar(s1),ncol=nchar(s2))
  for (i in 1:nchar(s1)) {
    for (j in 1:nchar(s2)) {
      if (substr(s1,i,i) == substr(s2,j,j)) {
        if (i == 1 || j == 1) {
          LCS[i,j] <- 1
        } else {
          LCS[i,j] <- LCS[i-1,j-1] + 1
        }
        if (LCS[i,j] > ell) {
          ell <- LCS[i,j]
          L <- c()
        }
        if (LCS[i,j] == ell) {
          L <- c(L,substr(s1,i-ell+1,i))
        }
      }
    }
  }
  L
}

Section 4.2.1: Generalized Suffix Arrays

sub generalized_suffix_array { my $seq = shift; my %seq = %{$seq}; my @gsa; for my $id (keys %seq) { for my $i (0 .. length($seq{$id})-1) { push @gsa, [$i, $id]; } } @gsa = sort {substr($seq{@{$a}[1]}, @{$a}[0]) cmp substr($seq{@{$b}[1]}, @{$b}[0])} @gsa; return \@gsa; }	generalized.suffix.array <- function (s1,s2) { sa1 <- apply(matrix(1:nchar(s1)),1, function (i) substr(s1,i,nchar(s1))) sa2 <- apply(matrix(1:nchar(s2)),1, function (i) substr(s2,i,nchar(s2))) arr <- order(c(sa1,sa2)) tag <- sapply(arr,function (i) if (i <= nchar(s1)) { 1 } else { 2 } ) arr <- sapply(arr,function (i) if (i <= nchar(s1)) { i } else { i-nchar(s1) } ) cbind(arr,tag) }
sub common_occurrences { my $p = shift; my $seq = shift; my %seq = %{$seq}; my $gsa = shift; my @gsa = @{$gsa}; my ($l, $r) = (0, $#gsa); while ($l <= $r) { my $i = int(($l + $r)/2); my $c = $p cmp substr($seq{$gsa[$i][1]}, $gsa[$i][0], length $p); if ($c < 0) { $r = $i - 1; } elsif ($c > 0) { $l = $i + 1; } else { my $j = $i; while ($i > 0 && $p eq substr($seq{$gsa[$i-1][1]}, $gsa[$i-1][0], length $p)) { $i--; } while ($j < $#gsa && $p eq substr($seq{$gsa[$j+1][1]}, $gsa[$j+1][0], length $p)) { $j++; } return ($i, $j); } } return (-1, -1); }	common.occurrences <- function (p,seq,gsa) { ell <- 1 r <- n <- nchar(seq[1]) + nchar(seq[2]) while (ell <= r) { i <- as.integer((ell + r) / 2) s <- substr(seq[gsa[i,2]], gsa[i,1],gsa[i,1]+nchar(p)-1) if (p < s) { r <- i - 1 } else if (p > s) { ell <- i + 1 } else { j <- i while (i > 1 && p == s) { s <- substr(seq[gsa[i-1,2]], gsa[i-1,1],gsa[i-1,1]+nchar(p)-1) if (p == s) i <- i - 1 } s <- p while (j < n && p == s) { s <- substr(seq[gsa[j+1,2]], gsa[j+1,1],gsa[j+1,1]+nchar(p)-1) if (p == s) j <- j + 1 } return(c(i,j)) } } return(c(-1,-1)) }
sub common_occurrences { my $p = shift; my $seq = shift; my %seq = %{$seq}; my $gsa = shift; my @gsa = @{$gsa}; my ($l, $r) = (0, $#gsa); my ($llcp, $rlcp) = (0, 0); while ($l <= $r) { my $i = int(($l + $r)/2); my $h = $llcp; $h = $rlcp if $rlcp < $h; my $c = lcp($p,$h,$seq{$gsa[$i][1]},$gsa[$i][0]+$h); if ($h+$c == length $p) { my $j = $i; while ($i > 0 && $p eq substr($seq{$gsa[$i-1][1]}, $gsa[$i-1][0], length $p)) { $i--; } while ($j < $#gsa-1 && $p eq substr($seq{$gsa[$j+1][1]}, $gsa[$j+1][0], length $p)) { $j++; } return ($i, $j); } elsif (substr($p,$h+$c,1) lt substr($seq{$gsa[$i][1]},$gsa[$i][0]+$h+$c,1)) { $r = $i - 1; $rlcp = $h + $c; } else { $l = $i + 1; $llcp = $h + $c; } } return (-1, -1); }	common.occurrences <- function (p,seq,gsa) { ell <- 1 r <- n <- nchar(seq[1]) + nchar(seq[2]) llcp <- 0 rlcp <- 0 while (ell <= r) { i <- as.integer((ell + r) / 2) s <- substr(seq[gsa[i,2]],gsa[i,1], gsa[i,1]+nchar(p)-1) h <- min(llcp,rlcp) c <- lcp(p,h+1,s,1+h) if (h+c == nchar(p)) { j <- i while (i > 1 && p == s) { s <- substr(seq[gsa[i-1,2]],gsa[i-1,1], gsa[i-1,1]+nchar(p)-1) if (p == s) i <- i - 1 } s <- p while (j < n && p == s) { s <- substr(seq[gsa[j+1,2]],gsa[j+1,1], gsa[j+1,1]+nchar(p)-1) if (p == s) j <- j + 1 } return(c(i,j)) } else { s <- substr(seq[gsa[i,2]],gsa[i,1]+h+c, gsa[i,1]+h+c) if (substr(p,h+c+1,h+c+1) < s) { r <- i - 1 rlcp <- h + c } else { ell <- i + 1 llcp <- h + c } } } return(c(-1,-1)) }
sub longest_common_subsequences { my $seq = shift; my %seq = %{$seq}; my $gsa = shift; my @gsa = @{$gsa}; my @L; my $l = 0; my $f2 = substr($seq{$gsa[0][1]}, $gsa[0][0], length($seq{$gsa[0][1]})); for (my $i = 1; $i < length($seq{1})+length($seq{2}); $i++) { my $f1 = $f2; $f2 = substr($seq{$gsa[$i][1]}, $gsa[$i][0], length($seq{$gsa[$i][1]})); if ($gsa[$i][1] != $gsa[$i-1][1]) { my $p = lcp($f1,0,$f2,0); if ($p > $l) { $l = $p; @L = (); } if ($p == $l) { push @L, substr($seq{$gsa[$i][1]},$gsa[$i][0],$p); } } } return \@L; }	longest.common.subsequences <- function (seq,gsa) { L <- c() ell <- 0 suf2 <- substr(seq[gsa[1,2]],gsa[1,1], nchar(seq[gsa[1,2]])) for (i in 2:(nrow(gsa))) { suf1 <- suf2 suf2 <- substr(seq[gsa[i,2]],gsa[i,1], nchar(seq[gsa[i,2]])) if (gsa[i-1,2] != gsa[i,2]) { pref <- substr(suf1,1,lcp(suf1,1,suf2,1)) if (nchar(pref) > ell) { ell <- nchar(pref) L <- c() } if (nchar(pref) == ell) { L <- c(L,pref) } } } L }

Section 4.3: Comparing Sequences

Section 4.3.1: Edit Distance Based Comparison of Sequences

sub hamming_distance {
  my $s1 = shift;
  my $s2 = shift;
  my $d = -1;
  if (length $s1 == length $s2) {
    $d = 0;
    for (my $i = 0; $i < length $s1; $i++) {
      $d++ if substr($s1,$i,1) ne substr($s2,$i,1);
    }
  }
  return $d;
}

hamming.distance <- function (s1,s2) {
  if (nchar(s1) != nchar(s2))
    -1
  else
    sum(strsplit(s1,"")[[1]] != strsplit(s2,"")[[1]])
}

sub levenshtein_distance {
  my $s1 = shift;
  my $s2 = shift;
  my ($n1, $n2) = (length $s1, length $s2);

  my @d;
  for ( my $i = 0; $i <= $n1; $i++ ) {
    $d[$i][0] = $i;
  }
  for ( my $j = 0; $j <= $n2; $j++ ) {
    $d[0][$j] = $j;
  }

  my @t1 = split //, $s1;
  my @t2 = split //, $s2;

  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      if ($t1[$i-1] eq $t2[$j-1]) {
        $d[$i][$j] = $d[$i-1][$j-1];
      } else {
        $d[$i][$j] = $d[$i-1][$j] + 1;
        if ($d[$i][$j-1] + 1 < $d[$i][$j]) {
          $d[$i][$j] = $d[$i][$j-1] + 1;
        }
      }
    }
  }

  return \@d;
}

levenshtein.distance <- function(s1,s2) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  n1 <- length(t1)
  n2 <- length(t2)
  d <- array(0,dim=c(n1+1,n2+1))
  d[,1] <- 0:n1
  d[1,] <- 0:n2
  for (i in 2:(n1+1))
    for (j in 2:(n2+1))
      if (t1[i-1] == t2[j-1])
        d[i,j] <- d[i-1,j-1]
      else
        d[i,j] <- min(d[i-1,j]+1, d[i,j-1]+1)
  d
}

sub edit_distance {
  my $s1 = shift;
  my $s2 = shift;
  my ($n1, $n2) = (length $s1, length $s2);
  my @d;
  for ( my $i = 0; $i <= $n1; $i++ ) {
    $d[$i][0] = $i;
  }
  for ( my $j = 0; $j <= $n2; $j++ ) {
    $d[0][$j] = $j;
  }

  my @t1 = split //, $s1;
  my @t2 = split //, $s2;

  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      $d[$i][$j] = $d[$i-1][$j] + 1;
      if ($d[$i][$j-1] + 1 < $d[$i][$j]) {
        $d[$i][$j] = $d[$i][$j-1] + 1;
      }
      if ($t1[$i-1] eq $t2[$j-1]) {
        if ($d[$i-1][$j-1] < $d[$i][$j]) {
          $d[$i][$j] = $d[$i-1][$j-1];
        }
      } else {
        if ($d[$i-1][$j-1] + 1 < $d[$i][$j]) {
          $d[$i][$j] = $d[$i-1][$j-1] + 1;
        }
      }
    }
  }

  return \@d;
}

edit.distance <- function (s1,s2) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  n1 <- length(t1)
  n2 <- length(t2)
  d <- array(0,dim=c(n1+1,n2+1))
  d[,1] <- 0:n1
  d[1,] <- 0:n2
  for (i in 2:(n1+1)) {
    for (j in 2:(n2+1)) {
      d[i,j] <- min(d[i-1,j]+1, d[i,j-1]+1)
      if (t1[i-1] == t2[j-1])
        d[i,j] <- min(d[i,j], d[i-1,j-1])
      else
        d[i,j] <- min(d[i,j], d[i-1,j-1]+1)
    }
  }
  d
}

Section 4.3.2: Alignment Based Comparison of Sequences

sub alignment {
  my $s1 = shift;
  my $s2 = shift;
  my $d = shift;
  my @d = @{$d};
  my ($t1, $t2) = ("", "");
  my @t1 = split //, $s1;
  my @t2 = split //, $s2;
  my ($n1, $n2) = (length $s1, length $s2);
  my ($i, $j) = ($n1, $n2);
  while ($i > 0 && $j > 0) {
    if ($d[$i][$j] eq $d[$i][$j-1]+1) {
      $t1 = "-" . $t1;
      $t2 = $t2[$j-1] . $t2;
      $j--;
    } elsif ($d[$i][$j] eq $d[$i-1][$j]+1) {
      $t1 = $t1[$i-1] . $t1;
      $t2 = "-" . $t2;
      $i--;
    } else {
      $t1 = $t1[$i-1] . $t1;
      $t2 = $t2[$j-1] . $t2;
      $i--;
      $j--;
    }
  }
  while ($j > 0) {
    $t1 = "-" . $t1;
    $t2 = $t2[$j-1] . $t2;
    $j--;
  }
  while ($i > 0) {
    $t1 = $t1[$i-1] . $t1;
    $t2 = "-" . $t2;
    $i--;
  }
  print "$t1\n$t2\n";
}

alignment <- function (s1,s2,d) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  a1 <- a2 <- c()
  i <- length(t1)+1
  j <- length(t2)+1
  while (i > 1 && j > 1) {
    if (d[i,j] == d[i,j-1]+1) {
      a1 <- c("-",a1)
      a2 <- c(t2[j-1],a2)
      j <- j-1
    } else if (d[i,j] == d[i-1,j]+1) {
      a1 <- c(t1[i-1],a1)
      a2 <- c("-",a2)
      i <- i-1
    } else {
      a1 <- c(t1[i-1],a1)
      a2 <- c(t2[j-1],a2)
      i <- i-1
      j <- j-1
    }
  }
  while (j > 1) {
    a1 <- c("-",a1)
    a2 <- c(t2[j-1],a2)
    j <- j-1
  }
  while (i > 1) {
    a1 <- c(t1[i-1],a1)
    a2 <- c("-",a2)
    i <- i-1
  }
  print(paste(a1,collapse=""))
  print(paste(a2,collapse=""))
}

sub local_alignment {
  my ($s1, $s2, $match, $mismatch, $gap) = @_;
  my ($n1, $n2) = ( length $s1, length $s2);

  my @s;
  for (my $i = 0; $i <= $n1; $i++) { $s[$i][0] = 0; }
  for (my $j = 0; $j <= $n2; $j++) { $s[0][$j] = 0; }

  my @t1 = split //, $s1;
  my @t2 = split //, $s2;

  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      if ($t1[$i-1] eq $t2[$j-1]) {
        $s[$i][$j] = $s[$i-1][$j-1] + $match;
      } else {
        $s[$i][$j] = $s[$i-1][$j-1] + $mismatch;
      }
      $s[$i][$j] = $s[$i-1][$j] + $gap
        if $s[$i-1][$j] + $gap > $s[$i][$j];
      $s[$i][$j] = $s[$i][$j-1] + $gap
        if $s[$i][$j-1] + $gap > $s[$i][$j];
      $s[$i][$j] = 0 if 0 > $s[$i][$j];
    }
  }

  my $max = 0;
  my @opt;
  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      if ($s[$i][$j] > $max) {
        $max = $s[$i][$j];
        @opt = ();
      }
      if ($s[$i][$j] == $max) {
        $max = $s[$i][$j];
        push @opt, $i, $j;
      }
    }
  }

  while (@opt) {
    my $i = shift @opt;
    my $j = shift @opt;
    my ($t1, $t2) = ("", "");
    while ($s[$i][$j] != 0) {
      if (($t1[$i-1] eq $t2[$j-1] && $s[$i][$j] eq
          $s[$i-1][$j-1] + $match) || ($t1[$i-1] ne $t2[$j-1] &&
          $s[$i][$j] eq $s[$i-1][$j-1] + $mismatch)) {
        $t1 = $t1[$i-1] . $t1;
        $t2 = $t2[$j-1] . $t2;
        $i--;
        $j--;
      } elsif ($s[$i][$j] eq $s[$i-1][$j] + $gap) {
        $t1 = $t1[$i-1] . $t1;
        $t2 = "-" . $t2;
        $i--;
      } elsif ($s[$i][$j] eq $s[$i][$j-1] + $gap) {
        $t1 = "-" . $t1;
        $t2 = $t2[$j-1] . $t2;
        $j--;
      }
    }
    print "$t1\n$t2\n";
  }
}

local.alignment <- function
    (s1,s2,match,mismatch,gap) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  n1 <- length(t1)
  n2 <- length(t2)
  s <- array(0,dim=c(n1+1,n2+1))
  s[,1] <- 0
  s[1,] <- 0
  for (i in 2:(n1+1)) {
    for (j in 2:(n2+1)) {
      if (t1[i-1] == t2[j-1])
        s[i,j] <- s[i-1,j-1] + match
      else
        s[i,j] <- s[i-1,j-1] + mismatch
      s[i,j] <- max(
        s[i,j],
        s[i-1,j] + gap,
        s[i,j-1] + gap,
        0)
    }
  }
  opt <- which(s==max(s),arr.ind=TRUE)
  for (sol in 1:nrow(opt)) {
    i <- opt[sol,1]
    j <- opt[sol,2]
    m1 <- c()
    m2 <- c()
    while (s[i,j] != 0) {
      if ((t1[i-1] == t2[j-1] && s[i,j] ==
            s[i-1,j-1] + match) ||
          (t1[i-1] != t2[j-1] && s[i,j] ==
            s[i-1,j-1] + mismatch)) {
        m1 <- c(t1[i-1],m1)
        m2 <- c(t2[j-1],m2)
        i <- i - 1
        j <- j - 1
      } else if (s[i,j] == s[i-1,j] + gap) {
        m1 <- c(t1[i-1],m1)
        m2 <- c("-",m2)
        i <- i - 1
      } else if (s[i,j] == s[i,j-1] + gap) {
        m1 <- c("-",m1)
        m2 <- c(t2[j-1],m2)
        j <- j - 1
      }
    }
    print(paste(m1,collapse=""))
    print(paste(m2,collapse=""))
  }
}

Chapter 5: Trees

Section 5.1: Trees in Mathematics

Section 5.1.1: Counting Labeled Trees

Section 5.2: Trees in Computer Science

Section 5.2.1: Traversing Rooted Trees

Section 5.3: Trees in Computational Biology

Section 5.3.1: The Newick Linear Representation

Section 5.3.2: Counting Phylogenetic Trees

Section 5.3.3: Generating Phylogenetic Trees

Section 5.3.4: Representing Trees in Perl

Section 5.3.5: Representing Trees in R

Chapter 6: Simple Pattern Matching in Trees

Section 6.1: Finding Paths in Unrooted Trees

Section 6.1.1: Distances in Unrooted Trees

Section 6.1.2: The Partition Distance between Unrooted Trees

sub partition {
  my $tree = shift;
  my %partition = ();
  for my $node ($tree->get_nodes) {
    next if $node->is_Leaf; # discard terminal nodes
    next unless $node->ancestor; # discard root
    my @a = sort { $a cmp $b } map { $_->id }
      grep {$_->is_Leaf } $node->get_all_Descendents;
    my $aa = join ",", @a;
    my @b = ();
    my %count = ();
    foreach my $label (@a, map { $_->id } $tree->get_leaf_nodes) {
      $count{$label}++
    }
    foreach my $label (keys %count) {
      push @b, $label if $count{$label} == 1;
    }
    my $bb = join ",", sort { $a cmp $b } @b;
    my $p = $aa le $bb ? "$aa:$bb" : "$bb:$aa";
    $partition{$p} = 1;
  }
  my @partition = sort keys %partition;
  return \@partition;
}

sub partition_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @partition1 = @{ partition($tree1) };
  my @partition2 = @{ partition($tree2) };
  my $dist = 0;
  my %count = ();
  foreach my $p (@partition1, @partition2) {
    $count{$p}++
  }
  foreach my $p (keys %count) {
    $dist++ if $count{$p} == 1;
  }
  return $dist;
}

library(ape)

dist.topo(t1,t2)

Section 6.1.3: The Nodal Distance between Unrooted Trees

sub distances {
  my $tree = shift;
  my @leaves = sort {$a->id cmp $b->id} $tree->get_leaf_nodes;
  my @labels = map { $_->id } @leaves;
  my $n = scalar @labels;
  my @dist;
  for my $i (1..$n-1) {
    my @nodes = $tree->find_node(-id => $labels[$i-1]);
    for my $j ($i+1..$n) {
      push @nodes, $tree->find_node(-id => $labels[$j-1]);
      push @dist, $tree->distance(-nodes => \@nodes);
      pop @nodes;
    }
  }
  return \@dist;
}

sub nodal_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @dist1 = @{ distances($tree1) };
  my @dist2 = @{ distances($tree2) };
  my ($dist, $d1, $d2);
  while (@dist1 and @dist2) {
    $d1 = pop @dist1;
    $d2 = pop @dist2;
    $dist += abs($d1-$d2);
  }
  return $dist;
}

library(ape)

nodal.distance <- function (t1,t2) {
    m1<-cophenetic.phylo(t1)
    m1<-m1[order(rownames(m1)),order(colnames(m1))]
    n1<-m1[upper.tri(m1)]
    m2<-cophenetic.phylo(t2)
    m2<-m2[order(rownames(m2)),order(colnames(m2))]
    n2<-m2[upper.tri(m2)]
    sum(abs(n1-n2))
}

Section 6.2: Finding Paths in Rooted Trees

Section 6.2.1: Distances in Rooted Trees

Section 6.2.2: The Partition Distance between Rooted Trees

Section 6.2.3: The Nodal Distance between Rooted Trees

Chapter 7: General Pattern Matching in Trees

Section 7.1: Finding Subtrees

Section 7.1.1: Finding Subtrees Induced by Triplets

sub triplet {
  my $tree = shift;
  my $i = shift;
  my $j = shift;
  my $k = shift;

  my @ij = grep { $_->id =~ /$i|$j/ } $tree->get_leaf_nodes;
  my @ik = grep { $_->id =~ /$i|$k/ } $tree->get_leaf_nodes;
  my @jk = grep { $_->id =~ /$j|$k/ } $tree->get_leaf_nodes;

  my $ij = $tree->get_lca(-nodes => \@ij);
  my $ik = $tree->get_lca(-nodes => \@ik);
  my $jk = $tree->get_lca(-nodes => \@jk);

  my $str;
  if ($ik == $jk) {
    $str = "(($i,$j),$k);";
  } else {
    if ($ij == $jk) {
      $str = "(($i,$k),$j);";
    } else {
      $str = "(($j,$k),$i);";
    }
  }

  return $str;
}

library(ape)

triplet <- function (t,i,j,k) {
    ij <- mrca(t)[i,j]
    ik <- mrca(t)[i,k]
    jk <- mrca(t)[j,k]
    if (ik == jk)
      paste("((",i,",",j,"),",k,");",sep="")
    else
      if (ij == jk)
        paste("((",i,",",k,"),",j,");",sep="")
      else
        paste("((",j,",",k,"),",i,");",sep="")
}

Section 7.1.2: Finding Subtrees Induced by Quartets

use Clone qw(clone);

sub quartet {
  my ($tree,$i,$j,$k,$l) = @_;
  my $copy = clone $tree;

  map { $copy->remove_Node($_) }
    grep { !($_->id =~ /$i|$j|$k|$l/) }
    $copy->get_leaf_nodes;

  return $copy;
}

library(ape)

quartet <- function (t,i,j,k,l) {
  unroot(drop.tip(t,setdiff(t$tip.label,c(i,j,k,l))))
}

Section 7.2: Finding Common Subtrees

Section 7.2.1: Maximum Agreement of Rooted Trees

	parent <- function (tree,node) tree$edge[which(tree$edge[,2]==node),1]
	children <- function (tree,node) tree$edge[which(tree$edge[,1]==node),2]
	is.root <- function (tree,node) length(tree$tip.label)+1==node
	is.ancestor <- function(tree,anc,des) { if (anc==des) return(TRUE) if (is.root(tree,des)) return(FALSE) else return(is.ancestor(tree,anc,parent(tree,des))) }
	postorder <- function (tree) { r <- length(tree$tip.label)+1 postorder.traversal(tree,r,c()) }
	postorder.traversal <- function (tree,v,res) { for (w in t1$edge[which(tree$edge[,1]==v),2]) res <- postorder.traversal(tree,w,res) c(res,v) }
sub mast_size { my $tree1 = shift; my $tree2 = shift; my (@m, $m); for my $node1 (@{ $tree1->Bio::Tree::Compatible::postorder_traversal }) { my $n1 = $node1->internal_id; for my $node2 (@{ $tree2->Bio::Tree::Compatible::postorder_traversal }) { my $n2 = $node2->internal_id; if ($node1->is_Leaf) { if ($node2->is_Leaf) { $m = ($node1->id eq $node2->id ? 1 : 0); } else { my $label1 = $node1->id; $m = (grep { /$label1/ } map { $_->id } grep { $_->is_Leaf } $node2->get_all_Descendents) ? 1 : 0; } } else { if ($node2->is_Leaf) { my $label2 = $node2->id; $m = (grep { /$label2/ } map { $_->id } grep { $_->is_Leaf } $node1->get_all_Descendents) ? 1 : 0; } else { my ($l1,$r1) = map { $_->internal_id } $node1->each_Descendent; my ($l2,$r2) = map { $_->internal_id } $node2->each_Descendent; $m = $m[$l1][$l2] + $m[$r1][$r2]; if ($m[$l1][$r2] + $m[$r1][$l2] > $m) { $m = $m[$l1][$r2] + $m[$r1][$l2]; } if ($m[$n1][$l2] > $m) { $m = $m[$n1][$l2]; } if ($m[$n1][$r2] > $m) { $m = $m[$n1][$r2]; } if ($m[$l1][$n2] > $m) { $m = $m[$l1][$n2]; } if ($m[$r1][$n2] > $m) { $m = $m[$r1][$n2]; } } } $m[$n1][$n2] = $m; } } return $m; }	mast.size <- function (t1,t2) { po1 <- postorder(t1) po2 <- postorder(t2) m <- matrix(0,nrow=length(po1),ncol=length(po2),dimnames=list(po1,po2)) for (i in po1) { for (j in po2) { if (length(t1$edge[which(t1$edge[,1]==i),1])==0) { if (length(t2$edge[which(t2$edge[,1]==j),1])==0) { if (t1$tip.label[i]==t2$tip.label[j]) m[i,j] <- 1 } else { jj <- which(t2$tip.label==t1$tip.label[i]) if (is.ancestor(t2,j,jj)) m[i,j] <- 1 } } else { if (length(t2$edge[which(t2$edge[,1]==j),1])==0) { ii <- which(t1$tip.label==t2$tip.label[j]) if (is.ancestor(t1,i,ii)) m[i,j] <- 1 } else { l1 <- children(t1,i)[1] r1 <- children(t1,i)[2] l2 <- children(t2,j)[1] r2 <- children(t2,j)[2] m[i,j] <- max(m[l1,l2]+m[r1,r2],m[l1,r2]+m[r1,l2], m[i,l2],m[i,r2],m[l1,j],m[r1,j]) } } } } m <- m[po1,po2] dimnames(m) <- list(po1,po2) m }
sub mast { my $tree1 = shift; my $tree2 = shift; my @m; for my $node1 (@{ $tree1->Bio::Tree::Compatible::postorder_traversal }) { my $n1 = $node1->internal_id; for my $node2 (@{ $tree2->Bio::Tree::Compatible::postorder_traversal }) { my $n2 = $node2->internal_id; my @mm = (); if ($node1->is_Leaf) { if ($node2->is_Leaf) { if ($node1->id eq $node2->id) { @mm = ($node1->id); } } else { my $label1 = $node1->id; if (grep { /$label1/ } map { $_->id } grep { $_->is_Leaf } $node2->get_all_Descendents) { @mm = ($label1); } } } else { if ($node2->is_Leaf) { my $label2 = $node2->id; if (grep { /$label2/ } map { $_->id } grep { $_->is_Leaf } $node1->get_all_Descendents) { @mm = ($label2); } } else { my ($l1,$r1) = map { $_->internal_id } $node1->each_Descendent; my ($l2,$r2) = map { $_->internal_id } $node2->each_Descendent; @mm = @{$m[$l1][$l2]}; push @mm, @{$m[$r1][$r2]}; if (scalar(@{$m[$l1][$r2]})+scalar(@{$m[$r1][$l2]})>scalar(@mm)) { @mm = @{$m[$l1][$r2]}; push @mm, @{$m[$r1][$l2]}; } if (scalar(@{$m[$n1][$l2]})>scalar(@mm)) { @mm = @{$m[$n1][$l2]}; } if (scalar(@{$m[$n1][$r2]})>scalar(@mm)) { @mm = @{$m[$n1][$r2]}; } if (scalar(@{$m[$l1][$n2]})>scalar(@mm)) { @mm = @{$m[$l1][$n2]}; } if (scalar(@{$m[$r1][$n2]})>scalar(@mm)) { @mm = @{$m[$r1][$n2]}; } } } $m[$n1][$n2] = \@mm; } } return \@m; }	mast <- function (t1,t2) { po1 <- postorder(t1) po2 <- postorder(t2) m <- matrix(rep(list(),length(po1)),nrow=length(po1),ncol=length(po2), dimnames=list(po1,po2)) for (i in po1) { for (j in po2) { mm <- list() if (length(t1$edge[which(t1$edge[,1]==i),1])==0) { if (length(t2$edge[which(t2$edge[,1]==j),1])==0) { if (t1$tip.label[i]==t2$tip.label[j]) mm <- c(t1$tip.label[i]) } else { jj <- which(t2$tip.label==t1$tip.label[i]) if (is.ancestor(t2,j,jj)) mm <- c(t1$tip.label[i]) } } else { if (length(t2$edge[which(t2$edge[,1]==j),1])==0) { ii <- which(t1$tip.label==t2$tip.label[j]) if (is.ancestor(t1,i,ii)) mm <- c(t2$tip.label[j]) } else { l1 <- children(t1,i)[1] r1 <- children(t1,i)[2] l2 <- children(t2,j)[1] r2 <- children(t2,j)[2] mm <- c(m[[l1,l2]],m[[r1,r2]]) if ( length(m[[l1,r2]]) + length(m[[r1,l2]]) > length(mm) ) mm <- c(m[[l1,r2]],m[[r1,l2]]) if ( length(m[[i,l2]]) > length(mm) ) mm <- m[[i,l2]] if ( length(m[[i,r2]]) > length(mm) ) mm <- m[[i,r2]] if ( length(m[[l1,j]]) > length(mm) ) mm <- m[[l1,j]] if ( length(m[[r1,j]]) > length(mm) ) mm <- m[[r1,j]] } } m[[i,j]] <- mm } } unlist(m[[length(t1$tip.label)+1,length(t2$tip.label)+1]]) }

Section 7.2.2: Maximum Agreement of Unrooted Trees

Section 7.3: Comparing Trees

Section 7.3.1: The Triplets Distance between Rooted Trees

sub triplets_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @leaves = sort {$a->id cmp $b->id} $tree1->get_leaf_nodes;
  my @labels = map { $_->id } @leaves;
  my $dist = 0;
  for (my $i = 0; $i < @leaves; $i++) {
    for (my $j = $i+1; $j < @leaves; $j++) {
      for (my $k = $j+1; $k < @leaves; $k++) {
        my $t1 = triplet($tree1,$labels[$i],$labels[$j],$labels[$k]);
        my $t2 = triplet($tree2,$labels[$i],$labels[$j],$labels[$k]);
        $dist += 2 unless ($t1 eq $t2);
      }
    }
  }
  return $dist;
}

triplets.distance <- function (t1,t2) {
  L <- sort(t1$tip.label)
  d <- 0
  for (i in L[1:(length(L)-2)]) {
    for (j in L[(match(i,L)+1):(length(L)-1)]) {
      for (k in L[(match(j,L)+1):length(L)]) {
        str1 <- triplet(t1,i,j,k)
        str2 <- triplet(t2,i,j,k)
        if (str1 != str2) { d <- d + 2 }
      }
    }
  }
  d
}

Section 7.3.2: The Quartets Distance between Unrooted Trees

sub quartets_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @leaves = sort {$a->id cmp $b->id} $tree1->get_leaf_nodes;
  my @labels = map { $_->id } @leaves;
  my $dist = 0;
  for (my $i = 0; $i < @leaves; $i++) {
    for (my $j = $i+1; $j < @leaves; $j++) {
      for (my $k = $j+1; $k < @leaves; $k++) {
        for (my $l = $k+1; $l < @leaves; $l++) {
          my $q1 = quartet($tree1,
            $labels[$i],$labels[$j],$labels[$k],$labels[$l]);
          my $q2 = quartet($tree2,
            $labels[$i],$labels[$j],$labels[$k],$labels[$l]);
          if (partition_distance($q1,$q2) != 0) {
            $dist += 2;
          }
        }
      }
    }
  }
  return $dist;
}

quartets.distance <- function (t1,t2) {
  L <- sort(t1$tip.label)
  d <- 0
  for (i in L[1:(length(L)-3)]) {
    for (j in L[(match(i,L)+1):(length(L)-2)]) {
      for (k in L[(match(j,L)+1):(length(L)-1)]) {
        for (l in L[(match(k,L)+1):length(L)]) {
          q1 <- quartet(t1,i,j,k,l)
          q2 <- quartet(t2,i,j,k,l)
          if (dist.topo(q1,q2) != 0) { d <- d + 2 }
        }
      }
    }
  }
  d
}

Chapter 8: Graphs

Section 8.1: Graphs in Mathematics

Section 8.1.1: Counting Labeled Graphs

Section 8.2: Graphs in Computer Science

Section 8.2.1: Traversing Directed Graphs

Section 8.3: Graphs in Computational Biology

Section 8.3.1: The eNewick Linear Representation

Section 8.3.2: Counting Phylogenetic Networks

Section 8.3.3: Generating Phylogenetic Networks

Section 8.3.4: Representing Graphs in Perl

Section 8.3.5: Representing Graphs in R

Chapter 9: Simple Pattern Matching in Graphs

Section 9.1: Finding Paths in Graphs

network.height <- function (net) {
  V(net)$height <- 0
  V(net)$visited <- FALSE
  Q <- as.vector(V(net)[which(degree(net,mode="out")==0)-1])
  while (length(Q)>0) {
    j <- Q[1]
    Q <- Q[-1]
    V(net)[j]$visited <- TRUE
    I <- as.vector(V(net)[nei(j,mode="in")])
    for (i in I) {
      V(net)[i]$height <- max(V(net)[i]$height,V(net)[j]$height+1)
      if (all(V(net)[nei(i,mode="out")]$visited)) Q <- c(Q,i)
    }
  }
  V(net)$height
}

Section 9.1.1: Distances in Graphs

use Bio::PhyloNetwork; sub is_strict_ancestor { my $net = shift; my $u = shift; my $v = shift; my $dag = $net->graph->copy_graph; my @roots = $dag->source_vertices; my $root = shift @roots; $dag->delete_vertex($u); return !$dag->is_reachable($root,$v); }	library(igraph) is.strict.ancestor <- function (net,i,j) { if (length(get.all.shortest.paths(net, V(net)[i],V(net)[j],mode="out")) == 0) return(FALSE) r <- V(net)[which(degree(net,mode="in")==0)-1] if (i == r \|\| i == j) return(TRUE) net <- delete.vertices(net,i) if (i < j) j <- j - 1 # account for deleted node length(get.all.shortest.paths(net,r,V(net)[j],mode="out")) == 0 }
	is.ancestor <- function (net,i,j) { length(get.all.shortest.paths(net,V(net)[i],V(net)[j],mode="out")) != 0 }
sub CSA { my $net = shift; my $v = shift; my $w = shift; my $dag = $net->graph; my @csa = (); foreach my $u ( $net->nodes ) { if ( $dag->is_reachable($u,$v) and $dag->is_reachable($u,$w) ) { if (is_strict_ancestor($net,$u,$v) or is_strict_ancestor($net,$u,$w)) { push @csa, $u; } } } return \@csa; }	CSA <- function (net,i,j) { strict <- lapply(V(net),function (u) is.ancestor(net,u,i) && is.ancestor(net,u,j) && (is.strict.ancestor(net,u,i) \|\| is.strict.ancestor(net,u,j))) V(net)[unlist(strict)] }
sub LCSA { my $net = shift; my $v = shift; my $w = shift; my %height = $net->heights; my @CSA = @{ CSA($net,$v,$w) }; my $lcsa = shift @CSA; for my $node ( @CSA ) { if ( $height{$node} < $height{$lcsa} ) { $lcsa = $node; } } return $lcsa; }	LCSA <- function (net,i,j) { csa <- match(CSA(net,i,j),V(net)) V(net)[csa[which.min(network.height(net)[csa])]-1] }

Section 9.1.2: The Path Multiplicity Distance between Graphs

path.multiplicity <- function (net) {
  leaves <- V(net)[which(degree(net,mode="out")==0)-1]
  mu <- matrix(0,nrow=vcount(net),ncol=length(leaves),
      dimnames=list(sort(V(net)$name),sort(leaves$name)))
  for (v in sort(V(net)[leaves]$name)) mu[v,v] <- 1
  V(net)$visited <- FALSE
  Q <- as.vector(V(net)[which(degree(net,mode="out")==0)-1])
  while (length(Q)>0) {
    j <- Q[1]
    Q <- Q[-1]
    V(net)[j]$visited <- TRUE
    I <- as.vector(V(net)[nei(j,mode="in")])
    for (i in I) {
      mu[V(net)[i]$name,] <- mu[V(net)[i]$name,] + mu[V(net)[j]$name,]
      if (all(V(net)[nei(i,mode="out")]$visited)) Q <- c(Q,i)
    }
  }
  mu
}

lex.cmp <- function(vec1,vec2) {
  index <- which.min(vec1 == vec2)
  as.numeric(sign(vec1[index] - vec2[index]))
}

path.multiplicity.distance <- function (net1,net2) {
  mu1 <- path.multiplicity(net1)
  mu1 <- mu1[do.call("order",lapply(1:ncol(mu1),function (j) mu1[,j])),]
  mu2 <- path.multiplicity(net2)
  mu2 <- mu2[do.call("order",lapply(1:ncol(mu2),function (j) mu2[,j])),]
  i1 <- 1
  i2 <- 1
  c <- 0
  while (i1 <= nrow(mu1) && i2 <= nrow(mu2)) {
    lex <- lex.cmp(mu1[i1,],mu2[i2,])
    if ( lex == -1 ) {
      i1 <- i1 + 1
    } else if ( lex == 1 ) {
      i2 <- i2 + 1
    } else {
      i1 <- i1 + 1
      i2 <- i2 + 1
      c <- c + 1
    }
  }
  nrow(mu1)+nrow(mu2)-2*c
}

Section 9.1.3: The Tripartition Distance between Graphs

tripartitions <- function (net) {
  leaves <- V(net)[which(degree(net,mode="out")==0)-1]
  theta <- matrix("",nrow=vcount(net),ncol=length(leaves),
      dimnames=list(sort(V(net)$name),sort(leaves$name)))
  for (i in V(net)) {
    for (j in leaves) {
      if ( is.ancestor(net,i,j) ) {
        if ( is.strict.ancestor(net,i,j) ) {
          theta[V(net)[i]$name,V(net)[j]$name] <- "A"
        } else {
          theta[V(net)[i]$name,V(net)[j]$name] <- "B"
        }
      } else {
        theta[V(net)[i]$name,V(net)[j]$name] <- "C"
      }
    }
  }
  theta
}

use Bio::PhyloNetwork;

sub tripartition_distance {
  my $net1 = shift;
  my $net2 = shift;

  $net1->compute_tripartitions() unless defined $net1->{tripartitions};
  $net2->compute_tripartitions() unless defined $net2->{tripartitions};

  my @tri1 = sort map {$net1->{tripartitions}->{$_}} $net1->nodes;
  my @tri2 = sort map {$net2->{tripartitions}->{$_}} $net2->nodes;

  my $i1 = 0;
  my $i2 = 0;
  my $c = 0;
  while ($i1 < scalar @tri1 && $i2 < scalar @tri2) {
    if ($tri1[$i1] lt $tri2[$i2]) {
      $i1++;
    } elsif ($tri1[$i1] gt $tri2[$i2]) {
      $i2++;
    } else {
      $i1++;
      $i2++;
      $c++;
    }
  }

  return scalar @tri1+scalar @tri2-2*$c;
}

library(igraph)

lex.cmp.char <- function (vec1,vec2) {
  for (j in 1:length(vec1)) {
    if (vec1[j] < vec2[j]) { return(-1) }
    if (vec1[j] > vec2[j]) { return(1) }
  }
  return(0)
}

tripartition.distance <- function (net1,net2) {
  theta1 <- tripartitions(net1)
  theta1 <- theta1[do.call("order",lapply(1:ncol(theta1),
      function (j) theta1[,j])),]
  theta2 <- tripartitions(net2)
  theta2 <- theta2[do.call("order",lapply(1:ncol(theta2),
      function (j) theta2[,j])),]
  i1 <- 1
  i2 <- 1
  c <- 0
  while (i1 <= nrow(theta1) && i2 <= nrow(theta2)) {
    lex <- lex.cmp.char(theta1[i1,],theta2[i2,])
    if ( lex == -1 ) {
      i1 <- i1 + 1
    } else if ( lex == 1 ) {
      i2 <- i2 + 1
    } else {
      i1 <- i1 + 1
      i2 <- i2 + 1
      c <- c + 1
    }
  }
  nrow(theta1)+nrow(theta2)-2*c
}

Section 9.1.4: The Nodal Distance between Graphs

sub nodal_distance {
  my $net1 = shift;
  my $net2 = shift;
  my @L = $net1->leaves;
  my $dist = 0;
  for (my $i = 0; $i < @L; $i++) {
    for (my $j = $i+1; $j < @L; $j++) {
      my $lcsa1 = LCSA($net1,$L[$i],$L[$j]);
      my $lcsa2 = LCSA($net2,$L[$i],$L[$j]);
      my @p1 = $net1->graph->SP_Dijkstra($lcsa1,$L[$i]);
      my @p2 = $net2->graph->SP_Dijkstra($lcsa2,$L[$i]);
      my $d1 = @p1 - 1;
      my $d2 = @p2 - 1;
      $dist += abs($d1 - $d2);
      @p1 = $net1->graph->SP_Dijkstra($lcsa1,$L[$j]);
      @p2 = $net2->graph->SP_Dijkstra($lcsa2,$L[$j]);
      $d1 = @p1 - 1;
      $d2 = @p2 - 1;
      $dist += abs($d1 - $d2);
    }
  }
  return $dist;
}

library(igraph)

nodal.distance <- function (net1,net2) {
  l1 <- V(net1)[which(degree(net1,mode="out")==0)-1]
  n1 <- matrix(0,nrow=length(l1),ncol=length(l1),
      dimnames=list(sort(l1$name),sort(l1$name)))
  l2 <- V(net2)[which(degree(net2,mode="out")==0)-1]
  n2 <- matrix(0,nrow=length(l2),ncol=length(l2),
      dimnames=list(sort(l2$name),sort(l2$name)))
  for (i in l1) {
    for (j in c(l1)[match(i,l1):length(l1)]) {
      lcsa <- LCSA(net1,i,j)
      paths <- get.shortest.paths(net1,lcsa,i,mode="out")
      n1[V(net1)[i]$name,V(net1)[j]$name] <- length(paths[[1]])-1
      paths <- get.shortest.paths(net1,lcsa,j,mode="out")
      n1[V(net1)[j]$name,V(net1)[i]$name] <- length(paths[[1]])-1
    }
  }
  for (i in l2) {
    for (j in c(l2)[match(i,l2):length(l2)]) {
      lcsa <- LCSA(net2,i,j)
      paths <- get.shortest.paths(net2,lcsa,i,mode="out")
      n2[V(net2)[i]$name,V(net2)[j]$name] <- length(paths[[1]])-1
      paths <- get.shortest.paths(net2,lcsa,j,mode="out")
      n2[V(net2)[j]$name,V(net2)[i]$name] <- length(paths[[1]])-1
    }
  }
  sum(abs(n1-n2))
}

Section 9.2: Finding Trees in Graphs

sub explode_rec {
  my ($self,$trees) = @_;
  my @h = $self->hybrid_nodes;
  if (scalar @h) {
    my $v = shift @h;
    for my $u ($self->{graph}->predecessors($v)) {
      $self->{graph}->delete_edge($u,$v);
      $self->explode_rec($trees);
      $self->{graph}->add_edge($u,$v);
    }
  } else {
    my $io = IO::String->new($self->eNewick);
    my $treeio = Bio::TreeIO->new(-format => 'newick', -fh => $io);
    my $tree = $treeio->next_tree;
    $tree->contract_linear_paths;
    push @{$trees}, $tree;
  }
}

sub explode {
  my ($self) = @_;
  my @trees;
  $self->explode_rec(\@trees);
  return @trees;
}

network.to.newick <- function (net) {
  r <- V(net)[which(degree(net,mode="in")==0)-1]
  newick <- obtain.newick(net,r,c())
  paste(c(newick,";"),collapse="")
}

obtain.newick <- function (net,v,newick) {
  if (length(V(net)[nei(v,"out")]) != 0) {
    if ( length(V(net)[nei(v,"in")]) != 0
        && match(v,V(net)[nei(V(net)[nei(v,"in")],"out")]) != 1 )
      newick <- c(newick,",")
    newick <- c(newick,"(")
  }
  for (w in V(net)[nei(v,"out")]) {
    newick <- obtain.newick(net,w,newick)
  }
  if (length(V(net)[nei(v,"out")]) != 0)
    newick <- c(newick,")")
  if ( length(V(net)[nei(v,"in")]) != 0
      && length(V(net)[nei(v,"out")]) == 0
      && match(v,V(net)[nei(V(net)[nei(v,"in")],"out")]) != 1 )
    newick <- c(newick,",")
  c(newick,V(net)[v]$name)
}

network.to.tree <- function (net) {
  V(net)$del <- FALSE
  E(net)$del <- FALSE
  r <- V(net)[which(degree(net,mode="in")==0)-1]
  net <- postorder.traversal(net,r)
  net <- delete.vertices(net,V(net)[del])
  net <- delete.edges(net,E(net)[del])
  network.to.newick(net)
}

postorder.traversal <- function (net,v) {
  for (w in V(net)[nei(v,"out")]) {
    net <- postorder.traversal(net,w)
  }
  VW <- E(net)[from == v & del==FALSE]
  if (length(c(VW)) == 1) {
    u <- V(net)[nei(v,"in")]
    w <- get.edge(net,VW)[2]
    E(net,path=c(V(net)[u],V(net)[v]))$del <- TRUE
    E(net,path=c(V(net)[v],V(net)[w]))$del <- TRUE
    V(net)[v]$del <- TRUE
    net <- add.edges(net,c(V(net)[u],V(net)[w]))
    E(net,path=c(V(net)[u],V(net)[w]))$del <- FALSE
  }
  net
}

explode <- function (net) {
  trees <- c()
  H <- V(net)[which(degree(net,mode="in")>1)-1]
  if (length(H) == 0) {
    trees <- c(trees,network.to.tree(net))
  } else {
    v <- c(H)[1]
    for (u in V(net)[nei(v,"in")]) {
      new <- delete.edges(net,E(net,P=c(u,v)))
      trees <- c(trees,explode(new))
    }
  }
  trees
}

Section 9.2.1: The Statistical Error between Graphs

sub error_ratio {
  my $net1 = shift;
  my $net2 = shift;
  my @trees1 = $net1->explode;
  my @trees2 = $net2->explode;
  my $FN = 0;
  my $FP = 0;
  for my $tree1 (@trees1) {
    my $found = 0; # false
    for my $tree2 (@trees2) {
      if (partition_distance($tree1,$tree2) == 0) {
        $found = 1; # true
      }
    }
    if (!$found) { $FN++; }
  }
  for my $tree2 (@trees2) {
    my $found = 0; # false
    for my $tree1 (@trees1) {
      if (partition_distance($tree1,$tree2) == 0) {
        $found = 1; # true
      }
    }
    if (!$found) { $FP++; }
  }
  return (($FN/@trees1)+($FP/@trees2))/2;
}

error.rate <- function (net1,net2) {
  trees1 <- explode(net1)
  trees2 <- explode(net2)
  FN <- 0
  FP <- 0
  for (t1 in trees1) {
    tree1 <- read.tree(text=t1)
    found <- FALSE
    for (t2 in trees2) {
      tree2 <- read.tree(text=t2)
      if (dist.topo(tree1,tree2) == 0) {
        found <- TRUE
        break
      }
    }
    if (!found) FN <- FN+1
  }
  for (t2 in trees2) {
    tree2 <- read.tree(text=t2)
    found <- FALSE
    for (t1 in trees1) {
      tree1 <- read.tree(text=t1)
      if (dist.topo(tree1,tree2) == 0) {
        found <- TRUE
        break
      }
    }
    if (!found) FP <- FP+1
  }
  (FN/length(trees1)+FP/length(trees2))/2
}

Chapter 10: General Pattern Matching in Graphs

Section 10.1: Finding Subgraphs

Section 10.1.1: Finding Subgraphs Induced by Triplets

sub triplet {
  my ($net,@triple) = @_;
  my ($i,$j,$k) = @triple;

  my @L = $net->leaves;
  my $ij = LCSA($net,$L[$i],$L[$j]);
  my $ik = LCSA($net,$L[$i],$L[$k]);
  my $jk = LCSA($net,$L[$j],$L[$k]);

  my $t;
  if ($net->graph->SP_Dijkstra($ij,$ik)) {
    if ($net->graph->SP_Dijkstra($ik,$ij)) {
      $t = "(($L[$j],$L[$k]),$L[$i]);";
    } else {
      if ($net->graph->SP_Dijkstra($jk,$ij)) {
        $t = "(($L[$i],$L[$k]),$L[$j]);";
      } else {
        $t = "(($L[$i],($L[$k])#H1),(#H1,$L[$j]));";
      }
    }
  } else {
    if ($net->graph->SP_Dijkstra($ik,$ij)) {
      if ($net->graph->SP_Dijkstra($jk,$ij)) {
        $t = "(($L[$i],$L[$j]),$L[$k]);";
      } else {
        $t = "(($L[$i],($L[$j])#H1),(#H1,$L[$k]));";
      }
    } else {
      $t = "(($L[$j],($L[$i])#H1),(#H1,$L[$k]));";
    }
  }
  return $t;
}

triplet <- function (net,i,j,k) {
    ij <- LCSA(net,i,j)
    ik <- LCSA(net,i,k)
    jk <- LCSA(net,j,k)

    ij.ancestor.of.ik <- length(get.all.shortest.paths(net,
        ij,ik,mode="out")) != 0
    ik.ancestor.of.ij <- length(get.all.shortest.paths(net,
        ik,ij,mode="out")) != 0
    ij.ancestor.of.jk <- length(get.all.shortest.paths(net,
        ij,jk,mode="out")) != 0
    jk.ancestor.of.ij <- length(get.all.shortest.paths(net,
        jk,ij,mode="out")) != 0

    if (ij.ancestor.of.ik)
      if (ik.ancestor.of.ij)
        paste("((",V(net)[j]$name,",",V(net)[k]$name,"),",
            V(net)[i]$name,");",sep="")
      else
        if (jk.ancestor.of.ij)
          paste("((",V(net)[i]$name,",",V(net)[k]$name,"),",
              V(net)[j]$name,");",sep="")
        else
          paste("((",V(net)[i]$name,",(",V(net)[k]$name,"))#H,(#H,",
              V(net)[j]$name,"));",sep="")
    else
      if (ik.ancestor.of.ij)
        if (jk.ancestor.of.ij)
          paste("((",V(net)[i]$name,",",V(net)[j]$name,"),",
              V(net)[k]$name,");",sep="")
        else
          paste("((",V(net)[i]$name,",(",V(net)[j]$name,"))#H,(#H,",
              V(net)[k]$name,"));",sep="")
      else
        paste("((",V(net)[j]$name,",(",V(net)[i]$name,"))#H,(#H,",
            V(net)[k]$name,"));",sep="")
}

sub triplet {
  my ($net,@triple) = @_;
  my ($i,$j,$k) = @triple;

  my @L = $net->leaves;
  my $ij = LCSA($net,$L[$i],$L[$j]);
  my $ik = LCSA($net,$L[$i],$L[$k]);
  my $jk = LCSA($net,$L[$j],$L[$k]);

  my @R;
  $R[0] = 1;
  $R[1] = $net->graph->SP_Dijkstra($ij,$ik) ? 1 : 0;
  $R[2] = $net->graph->SP_Dijkstra($ij,$jk) ? 1 : 0;
  $R[3] = $net->graph->SP_Dijkstra($ik,$ij) ? 1 : 0;
  $R[4] = 1;
  $R[5] = $net->graph->SP_Dijkstra($ik,$jk) ? 1 : 0;
  $R[6] = $net->graph->SP_Dijkstra($jk,$ij) ? 1 : 0;
  $R[7] = $net->graph->SP_Dijkstra($jk,$ik) ? 1 : 0;
  $R[8] = 1;

  return join "", @R;
}

triplet <- function (net,i,j,k) {
  ij <- LCSA(net,i,j)
  ik <- LCSA(net,i,k)
  jk <- LCSA(net,j,k)

  L <- c(V(net)[i]$name,V(net)[j]$name,V(net)[k]$name)
  R <- matrix(rep(NA,9),nrow=3,dimnames=list(L,L))
  R[1,1] <- TRUE
  R[1,2] <- length(get.all.shortest.paths(net,ij,ik,mode="out")) != 0
  R[1,3] <- length(get.all.shortest.paths(net,ij,jk,mode="out")) != 0
  R[2,1] <- length(get.all.shortest.paths(net,ik,ij,mode="out")) != 0
  R[2,2] <- TRUE
  R[2,3] <- length(get.all.shortest.paths(net,ik,jk,mode="out")) != 0
  R[3,1] <- length(get.all.shortest.paths(net,jk,ij,mode="out")) != 0
  R[3,2] <- length(get.all.shortest.paths(net,jk,ik,mode="out")) != 0
  R[3,3] <- TRUE

  R
}

Section 10.2: Finding Common Subgraphs

Section 10.2.1: Maximum Agreement of Rooted Networks

sub bottom_up_subgraph { my $graph = shift; my $node = shift; my $sub = Graph->new; my @Q = ($node); while (@Q) { $node = shift @Q; $sub->add_vertex($node); push @Q, $graph->successors($node); } for my $edge ($graph->edges) { if ($sub->has_vertex(@$edge[0]) && $sub->has_vertex(@$edge[1])) { $sub->add_edge(@$edge[0],@$edge[1]); } } while (my @V = grep { $sub->in_degree($_)==1 && $sub->out_degree($_)==1 } $sub->interior_vertices) { my $v = shift @V; my @U = $sub->predecessors($v); my $u = shift @U; my @W = $sub->successors($v); my $w = shift @W; $sub->add_edge($u,$w); $sub->delete_vertex($v); } return $sub; }	bottom.up.subgraph <- function (net,i) { Q <- c(i) S <- c() while (length(Q)>0) { i <- Q[1] Q <- Q[-1] S <- c(S,i) Q <- c(Q,as.vector(V(net)[nei(i,mode="out")])) } g <- subgraph(net,unique(S)) while (length(V(g)[which(degree(g,mode="in")==1 & degree(g,mode="out")==1)-1])>0) { D <- V(g)[which(degree(g,mode="in")==1 & degree(g,mode="out")==1)-1] v <- head(c(D),n=1) # elementary node u <- V(g)[nei(v,"in")] w <- V(g)[nei(v,"out")] g <- add.edges(g,c(V(g)[u],V(g)[w])) g <- delete.vertices(g,V(g)[v]) } g }
sub graph_isomorphic { my $g1 = shift; my $g2 = shift; my $found = 0; if ($g1->could_be_isomorphic($g2)) { my $C1 = cluster_map($g1); my $C2 = cluster_map($g2); my %X; for my $node1 ($g1->vertices) { for my $node2 ($g2->vertices) { my $lc = List::Compare->new(@{$C1}{$node1}, @{$C2}{$node2}); if ($lc->is_LequivalentR) { push @{$X{$node1}}, $node2; } } } my $node1 = undef; my %M; my @V1 = sort $g1->vertices; backtrack($g1,$g2,\%X,$node1,\%M,\@V1,\$found); } return $found; }
sub backtrack { my ($g1,$g2,$X,$node1,$M,$V1,$found) = @_; my %X = %{$X}; my %M = %{$M}; my @V1 = @{$V1}; unless ($$found) { for my $prev1 (keys %M) { if ($g1->has_edge($prev1,$node1) && !$g2->has_edge($M{$prev1},$M{$node1}) \|\| $g1->has_edge($node1,$prev1) && !$g2->has_edge($M{$node1},$M{$prev1}) \|\| !$g1->has_edge($prev1,$node1) && $g2->has_edge($M{$prev1},$M{$node1}) \|\| !$g1->has_edge($node1,$prev1) && $g2->has_edge($M{$node1},$M{$prev1})) { return; } } if (scalar @V1) { $node1 = shift @V1; for my $node2 (@{$X{$node1}}) { $M{$node1} = $node2; backtrack($g1,$g2,$X,$node1,\%M,\@V1,$found); $M{$node1} = undef; } } else { $$found = 1; } } }
sub cluster_map { my $graph = shift; my %C; for my $node ($graph->vertices) { for my $leaf ($graph->successorless_vertices) { if ($graph->is_reachable($node,$leaf)) { push @{$C{$node}}, $leaf; } } } return \%C; }
sub max_agreement_subgraph { my $net1 = shift; my $net2 = shift; my $graph1 = $net1->graph; my $graph2 = $net2->graph; my $common = Graph->new; my $size = 0; for my $node1 ($graph1->interior_vertices) { my $sub1 = subgraph($graph1,$node1); for my $node2 ($graph2->interior_vertices) { my $sub2 = subgraph($graph2,$node2); if (graph_isomorphic($sub1,$sub2)) { my $leaves1 = $sub1->successorless_vertices; if ($leaves1 > $size) { $common = $sub1; $size = $leaves1; } } } } return $common; }	max.agreement.subgraph <- function (net1,net2) { max <- graph.empty() max.size <- 0 for (i in V(net1)) { g1 <- bottom.up.subgraph(net1,i) for (j in V(net2)) { g2 <- bottom.up.subgraph(net2,j) if (graph.isomorphic(g1,g2)) { L <- V(g1)[which(degree(g1,mode="out")==0)-1] if (length(L)>max.size) { max <- g1 max.size <- length(L) } } } } max }

Section 10.3: Comparing Graphs

Section 10.3.1: The Triplets Distance between Graphs

sub triplets_distance {
  my $net1 = shift;
  my $net2 = shift;
  my $comp = Array::Compare->new;
  my @L = $net1->leaves;
  my $dist = 0;
  for (my $i = 0; $i < @L; $i++) {
    for (my $j = $i+1; $j < @L; $j++) {
      for (my $k = $j+1; $k < @L; $k++) {
        my $t1 = triplet($net1,$i,$j,$k);
        my $t2 = triplet($net2,$i,$j,$k);
	$dist += 2 unless ($t1 eq $t2);
      }
    }
  }
  return $dist;
}

triplets.distance <- function (net1,net2) {
  L <- sort(V(net1)[which(degree(net1,mode="out")==0)-1]$name)
  d <- 0
  for (i in L[1:(length(L)-2)]) {
    i1 <- V(net1)[V(net1)$name == i]
    i2 <- V(net2)[V(net2)$name == i]
    for (j in L[(match(i,L)+1):(length(L)-1)]) {
      j1 <- V(net1)[V(net1)$name == j]
      j2 <- V(net2)[V(net2)$name == j]
      for (k in L[(match(j,L)+1):length(L)]) {
        k1 <- V(net1)[V(net1)$name == k]
        k2 <- V(net2)[V(net2)$name == k]
        t1 <- triplet(net1,i1,j1,k1)
        t2 <- triplet(net2,i2,j2,k2)
        if (!isTRUE(all.equal(t1,t2))) { d <- d + 2 }
      }
    }
  }
  d
}

Appendix A: Elements of Perl

Section A.1: Perl Scripts

sub rna_to_protein { my $rna = shift; my $protein; my $i = 0; while ($i < length($rna) - 2 && substr($rna,$i,3) ne "AUG") { # start codon $i++; } $i += 3; # skip the start codon while ($i < length($rna) - 2 && substr($rna,$i,3) ne "UAA" && substr($rna,$i,3) ne "UAG" && substr($rna,$i,3) ne "UGA") { my $codon = substr($rna,$i,3); $protein .= codon_to_amino_acid($codon); $i += 3; } return $protein; }
sub codon_to_amino_acid { my $codon = shift; if ($codon eq "UUU") { return "F" } # Phe elsif ($codon eq "UUC") { return "F" } # Phe elsif ($codon eq "UUA") { return "L" } # Leu elsif ($codon eq "UUG") { return "L" } # Leu elsif ($codon eq "UCU") { return "S" } # Ser elsif ($codon eq "UCC") { return "S" } # Ser elsif ($codon eq "UCA") { return "S" } # Ser elsif ($codon eq "UCG") { return "S" } # Ser elsif ($codon eq "UAU") { return "Y" } # Tyr elsif ($codon eq "UAC") { return "Y" } # Tyr elsif ($codon eq "UAA") { return "-" } # stop elsif ($codon eq "UAG") { return "-" } # stop elsif ($codon eq "UGU") { return "C" } # Cys elsif ($codon eq "UGC") { return "C" } # Cys elsif ($codon eq "UGA") { return "-" } # stop elsif ($codon eq "UGG") { return "W" } # Trp elsif ($codon eq "CUU") { return "L" } # Leu elsif ($codon eq "CUC") { return "L" } # Leu elsif ($codon eq "CUA") { return "L" } # Leu elsif ($codon eq "CUG") { return "L" } # Leu elsif ($codon eq "CCU") { return "P" } # Pro elsif ($codon eq "CCC") { return "P" } # Pro elsif ($codon eq "CCA") { return "P" } # Pro elsif ($codon eq "CCG") { return "P" } # Pro elsif ($codon eq "CAU") { return "H" } # His elsif ($codon eq "CAC") { return "H" } # His elsif ($codon eq "CAA") { return "Q" } # Gln elsif ($codon eq "CAG") { return "Q" } # Gln elsif ($codon eq "CGU") { return "R" } # Arg elsif ($codon eq "CGC") { return "R" } # Arg elsif ($codon eq "CGA") { return "R" } # Arg elsif ($codon eq "CGG") { return "R" } # Arg elsif ($codon eq "AUU") { return "I" } # Ile elsif ($codon eq "AUC") { return "I" } # Ile elsif ($codon eq "AUA") { return "I" } # Ile elsif ($codon eq "AUG") { return "M" } # Met elsif ($codon eq "ACU") { return "T" } # Thr elsif ($codon eq "ACC") { return "T" } # Thr elsif ($codon eq "ACA") { return "T" } # Thr elsif ($codon eq "ACG") { return "T" } # Thr elsif ($codon eq "AAU") { return "N" } # Asn elsif ($codon eq "AAC") { return "N" } # Asn elsif ($codon eq "AAA") { return "K" } # Lys elsif ($codon eq "AAG") { return "K" } # Lys elsif ($codon eq "AGU") { return "S" } # Ser elsif ($codon eq "AGC") { return "S" } # Ser elsif ($codon eq "AGA") { return "R" } # Arg elsif ($codon eq "AGG") { return "R" } # Arg elsif ($codon eq "GUU") { return "V" } # Val elsif ($codon eq "GUC") { return "V" } # Val elsif ($codon eq "GUA") { return "V" } # Val elsif ($codon eq "GUG") { return "V" } # Val elsif ($codon eq "GCU") { return "A" } # Ala elsif ($codon eq "GCC") { return "A" } # Ala elsif ($codon eq "GCA") { return "A" } # Ala elsif ($codon eq "GCG") { return "A" } # Ala elsif ($codon eq "GAU") { return "D" } # Asp elsif ($codon eq "GAC") { return "D" } # Asp elsif ($codon eq "GAA") { return "E" } # Glu elsif ($codon eq "GAG") { return "E" } # Glu elsif ($codon eq "GGU") { return "G" } # Gly elsif ($codon eq "GGC") { return "G" } # Gly elsif ($codon eq "GGA") { return "G" } # Gly elsif ($codon eq "GGG") { return "G" } # Gly else { return "*" } }
my %codon2aa = ( "UUU" => "F", # Phe "UUC" => "F", # Phe "UUA" => "L", # Leu "UUG" => "L", # Leu "UCU" => "S", # Ser "UCC" => "S", # Ser "UCA" => "S", # Ser "UCG" => "S", # Ser "UAU" => "Y", # Tyr "UAC" => "Y", # Tyr "UAA" => "-", # stop "UAG" => "-", # stop "UGU" => "C", # Cys "UGC" => "C", # Cys "UGA" => "-", # stop "UGG" => "W", # Trp "CUU" => "L", # Leu "CUC" => "L", # Leu "CUA" => "L", # Leu "CUG" => "L", # Leu "CCU" => "P", # Pro "CCC" => "P", # Pro "CCA" => "P", # Pro "CCG" => "P", # Pro "CAU" => "H", # His "CAC" => "H", # His "CAA" => "Q", # Gln "CAG" => "Q", # Gln "CGU" => "R", # Arg "CGC" => "R", # Arg "CGA" => "R", # Arg "CGG" => "R", # Arg "AUU" => "I", # Ile "AUC" => "I", # Ile "AUA" => "I", # Ile "AUG" => "M", # Met "ACU" => "T", # Thr "ACC" => "T", # Thr "ACA" => "T", # Thr "ACG" => "T", # Thr "AAU" => "N", # Asn "AAC" => "N", # Asn "AAA" => "K", # Lys "AAG" => "K", # Lys "AGU" => "S", # Ser "AGC" => "S", # Ser "AGA" => "R", # Arg "AGG" => "R", # Arg "GUU" => "V", # Val "GUC" => "V", # Val "GUA" => "V", # Val "GUG" => "V", # Val "GCU" => "A", # Ala "GCC" => "A", # Ala "GCA" => "A", # Ala "GCG" => "A", # Ala "GAU" => "D", # Asp "GAC" => "D", # Asp "GAA" => "E", # Glu "GAG" => "E", # Glu "GGU" => "G", # Gly "GGC" => "G", # Gly "GGA" => "G", # Gly "GGG" => "G" # Gly );
sub rna_to_protein { my $rna = shift; my $protein; my $aa; my $i = 0; while ($i < length($rna) - 2 && substr($rna,$i,3) ne "AUG") { # start codon $i++; } $i += 3; # skip the start codon while ($i < length($rna) - 2 && substr($rna,$i,3) ne "UAA" && substr($rna,$i,3) ne "UAG" && substr($rna,$i,3) ne "UGA") { my $codon = substr($rna,$i,3); if (defined $codon2aa{$codon}) { $aa = $codon2aa{$codon}; } else { $aa = "*"; } $protein .= $aa; $i += 3; } return $protein; }

Section A.2: Overview of Perl

package DNA;

sub reverse_complement {
  my $seq = shift;
  $seq = reverse $seq;
  $seq =~ tr/ACGT/TGCA/;
  return $seq;
}

package RNA;

sub reverse_complement {
  my $seq = shift;
  $seq = reverse $seq;
  $seq =~ tr/ACGU/UGCA/;
  return $seq;
}

1;

Section A.3: Perl Quick Reference Card

Appendix B: Elements of R

Section B.1: R Scripts

	rna.to.protein <- function (rna) { protein <- "" i <- 1 while (i < nchar(rna)-1 && substr(rna,i,i+2) != "AUG") { # start codon i <- i+1 } i <- i+3 # skip the start codon while (i < nchar(rna)-1 && substr(rna,i,i+2) != "UAA" && substr(rna,i,i+2) != "UAG" && substr(rna,i,i+2) != "UGA") { codon <- substr(rna,i,i+2) protein <- paste(protein, codon.to.amino.acid(codon),sep="") i <- i+3 } return(protein) }
	codon.to.amino.acid <- function (codon) { if (codon == "UUU") return("F") # Phe else if (codon == "UUC") return("F") # Phe else if (codon == "UUA") return("L") # Leu else if (codon == "UUG") return("L") # Leu else if (codon == "UCU") return("S") # Ser else if (codon == "UCC") return("S") # Ser else if (codon == "UCA") return("S") # Ser else if (codon == "UCG") return("S") # Ser else if (codon == "UAU") return("Y") # Tyr else if (codon == "UAC") return("Y") # Tyr else if (codon == "UAA") return("-") # stop else if (codon == "UAG") return("-") # stop else if (codon == "UGU") return("C") # Cys else if (codon == "UGC") return("C") # Cys else if (codon == "UGA") return("-") # stop else if (codon == "UGG") return("W") # Trp else if (codon == "CUU") return("L") # Leu else if (codon == "CUC") return("L") # Leu else if (codon == "CUA") return("L") # Leu else if (codon == "CUG") return("L") # Leu else if (codon == "CCU") return("P") # Pro else if (codon == "CCC") return("P") # Pro else if (codon == "CCA") return("P") # Pro else if (codon == "CCG") return("P") # Pro else if (codon == "CAU") return("H") # His else if (codon == "CAC") return("H") # His else if (codon == "CAA") return("Q") # Gln else if (codon == "CAG") return("Q") # Gln else if (codon == "CGU") return("R") # Arg else if (codon == "CGC") return("R") # Arg else if (codon == "CGA") return("R") # Arg else if (codon == "CGG") return("R") # Arg else if (codon == "AUU") return("I") # Ile else if (codon == "AUC") return("I") # Ile else if (codon == "AUA") return("I") # Ile else if (codon == "AUG") return("M") # Met else if (codon == "ACU") return("T") # Thr else if (codon == "ACC") return("T") # Thr else if (codon == "ACA") return("T") # Thr else if (codon == "ACG") return("T") # Thr else if (codon == "AAU") return("N") # Asn else if (codon == "AAC") return("N") # Asn else if (codon == "AAA") return("K") # Lys else if (codon == "AAG") return("K") # Lys else if (codon == "AGU") return("S") # Ser else if (codon == "AGC") return("S") # Ser else if (codon == "AGA") return("R") # Arg else if (codon == "AGG") return("R") # Arg else if (codon == "GUU") return("V") # Val else if (codon == "GUC") return("V") # Val else if (codon == "GUA") return("V") # Val else if (codon == "GUG") return("V") # Val else if (codon == "GCU") return("A") # Ala else if (codon == "GCC") return("A") # Ala else if (codon == "GCA") return("A") # Ala else if (codon == "GCG") return("A") # Ala else if (codon == "GAU") return("D") # Asp else if (codon == "GAC") return("D") # Asp else if (codon == "GAA") return("E") # Glu else if (codon == "GAG") return("E") # Glu else if (codon == "GGU") return("G") # Gly else if (codon == "GGC") return("G") # Gly else if (codon == "GGA") return("G") # Gly else if (codon == "GGG") return("G") # Gly else return("*") }
	codon2aa <- c( "UUU" = "F", # Phe "UUC" = "F", # Phe "UUA" = "L", # Leu "UUG" = "L", # Leu "UCU" = "S", # Ser "UCC" = "S", # Ser "UCA" = "S", # Ser "UCG" = "S", # Ser "UAU" = "Y", # Tyr "UAC" = "Y", # Tyr "UAA" = "-", # stop "UAG" = "-", # stop "UGU" = "C", # Cys "UGC" = "C", # Cys "UGA" = "-", # stop "UGG" = "W", # Trp "CUU" = "L", # Leu "CUC" = "L", # Leu "CUA" = "L", # Leu "CUG" = "L", # Leu "CCU" = "P", # Pro "CCC" = "P", # Pro "CCA" = "P", # Pro "CCG" = "P", # Pro "CAU" = "H", # His "CAC" = "H", # His "CAA" = "Q", # Gln "CAG" = "Q", # Gln "CGU" = "R", # Arg "CGC" = "R", # Arg "CGA" = "R", # Arg "CGG" = "R", # Arg "AUU" = "I", # Ile "AUC" = "I", # Ile "AUA" = "I", # Ile "AUG" = "M", # Met "ACU" = "T", # Thr "ACC" = "T", # Thr "ACA" = "T", # Thr "ACG" = "T", # Thr "AAU" = "N", # Asn "AAC" = "N", # Asn "AAA" = "K", # Lys "AAG" = "K", # Lys "AGU" = "S", # Ser "AGC" = "S", # Ser "AGA" = "R", # Arg "AGG" = "R", # Arg "GUU" = "V", # Val "GUC" = "V", # Val "GUA" = "V", # Val "GUG" = "V", # Val "GCU" = "A", # Ala "GCC" = "A", # Ala "GCA" = "A", # Ala "GCG" = "A", # Ala "GAU" = "D", # Asp "GAC" = "D", # Asp "GAA" = "E", # Glu "GAG" = "E", # Glu "GGU" = "G", # Gly "GGC" = "G", # Gly "GGA" = "G", # Gly "GGG" = "G" # Gly )
	rna.to.protein <- function (rna) { protein <- "" i <- 1 while (i < nchar(rna)-1 && substr(rna,i,i+2) != "AUG") { # start codon i <- i+1 } i <- i+3 # skip the start codon while (i < nchar(rna)-1 && substr(rna,i,i+2) != "UAA" && substr(rna,i,i+2) != "UAG" && substr(rna,i,i+2) != "UGA") { codon <- substr(rna,i,i+2) if (is.na(codon2aa[codon])) { aa <- "*" } else { aa <- codon2aa[codon] } protein <- paste(protein,aa,sep="") i <- i+3 } return(protein) }

Section B.2: Overview of R

reverse.complement <- function (seq)
  UseMethod("reverse.complement")

reverse.complement.dna <- function (seq) {
  rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="")
  chartr("ACGT","TGCA",rev)
}

reverse.complement.rna <- function (seq) {
  rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="")
  chartr("ACGU","UGCA",rev)
}

Section B.3: R Quick Reference Card

Gabriel Valiente valiente@ (lsi.upc.edu)