Gabriel Valiente

Combinatorial Pattern Matching Algorithms in Computational Biology using Perl and R


Here is the source code for Combinatorial Pattern Matching Algorithms in Computational Biology using Perl and R, by Gabriel Valiente. It has been tested using Perl version 5.8.6 (BioPerl version 1.5) and R version 2.7.1 (seqinr version 1.1-7 and APE version 2.1-3).


Chapter 1: Introduction

Section 1.1: Combinatorial Pattern Matching

Section 1.2: Computational Biology

Section 1.3: A Motivating Example: Gene Prediction

sub extract_open_reading_frames {
  my $seq = shift;
  for my $r (0,1,2) {
    for (my $i = $r; $i <= length($seq)-3; $i += 3) {
      if (substr($seq,$i,3) eq "ATG") {
        my $j = $i+3;
        while ($j <= length($seq)-3 &&
            substr($seq,$j,3) ne "TAA" &&
            substr($seq,$j,3) ne "TAG" &&
            substr($seq,$j,3) ne "TGA") {
          $j += 3;
        }
        if ($j <= length($seq)-3) {
          my $len = $j+2-$i+1;
          if ($len >= 30) {
            print substr($seq,$i,$j+2-$i+1),"\n";
          }
        }
      }
    }
  }
}
extract.open.reading.frames <- function (seq) {
  for (i in 1:3) {
    while (i+2 <= nchar(seq)) {
      if (substr(seq,i,i+2) == "ATG") {
        j <- i + 3
        while (j+2 <= nchar(seq) &&
            substr(seq,j,j+2) != "TAA" &&
            substr(seq,j,j+2) != "TAG" &&
            substr(seq,j,j+2) != "TGA") {
          j <- j + 3
        }
        if (j+2 <= nchar(seq)) {
          if (j+2-i+1 >= 30) {
            print(c(i,j+2,substr(seq,i,j+2)))
          }
        }
      }
      i <- i + 3
    }
  }
}
sub longest_open_reading_frame {
  my $seq = shift;
  my ($ii,$jj) = (0,0);
  for my $r (0,1,2) {
    for (my $i = $r; $i <= length($seq)-3; $i += 3) {
      if (substr($seq,$i,3) eq "ATG") {
        my $j = $i+3;
        while ($j <= length($seq)-3 &&
            substr($seq,$j,3) ne "TAA" &&
            substr($seq,$j,3) ne "TAG" &&
            substr($seq,$j,3) ne "TGA") {
          $j += 3;
        }
        if ($j <= length($seq)-3) {
          my $len = $j+2-$i+1;
          if ($j+2-$i+1 > $jj+2-$ii+1) {
            $ii = $i;
            $jj = $j;
          }
        }
      }
    }
  }
  return [$ii,$jj+2];
}
longest.open.reading.frame <- function (seq) {
  ii <- jj <- 0
  for (i in 1:3) {
    while (i+2 <= nchar(seq)) {
      if (substr(seq,i,i+2) == "ATG") {
        j <- i + 3
        while (j+2 <= nchar(seq) &&
            substr(seq,j,j+2) != "TAA" &&
            substr(seq,j,j+2) != "TAG" &&
            substr(seq,j,j+2) != "TGA") {
          j <- j + 3
        }
        if (j+2 <= nchar(seq)) {
          if (j+2-i+1 > jj+2-ii+1) {
            ii <- i
            jj <- j
          }
        }
      }
      i <- i + 3
    }
  }
  c(ii,jj+2)
}

Chapter 2: Sequences

Section 2.1: Sequences in Mathematics

Section 2.1.1: Counting Labeled Sequences

Section 2.2: Sequences in Computer Science

Section 2.2.1: Traversing Labeled Sequences
sub seq_to_labeled_seq {
  my $seq = shift;
  my %seq;
  for (my $i = 0; $i < length($seq); $i++) {
    my $elem = substr($seq,$i,1);
    $seq{$elem}++;
  }
  return \%seq;
}
seq.to.labeled.seq <- function (seq) {
  alphabet <- sort(unique(strsplit(seq,"")[[1]]))
  lab <- matrix(0,nrow=length(alphabet),ncol=1)
  dimnames(lab) <- list(alphabet,"count")
  for (i in 1:nchar(seq))
    lab[substr(seq,i,i),] <-
      lab[substr(seq,i,i),] + 1
  lab
}
sub symmetric_difference {
  my $seq1 = shift;
  my $seq2 = shift;

  my %seq1 = %{ seq_to_labeled_seq($seq1) };
  my %seq2 = %{ seq_to_labeled_seq($seq2) };

  for my $elem (keys %seq2) {
    $seq1{$elem} = abs($seq1{$elem}-$seq2{$elem});
  }

  return \%seq1;
}
symmetric.difference <- function (seq1,seq2) {
    lab1 <- seq.to.labeled.seq(seq1)
    lab2 <- seq.to.labeled.seq(seq2)
    for (elem in row.names(lab2))
      lab1[elem,] <- abs(lab1[elem,] - lab2[elem,])
    lab1
}

Section 2.3: Sequences in Computational Biology

Section 2.3.1: Reverse Complementing DNA Sequences
sub reverse_complement {
  my $seq = shift;
  $seq = reverse $seq;
  $seq =~ tr/ACGT/TGCA/;
  return $seq;
}
reverse.complement <- function (seq) {
  rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="")
  chartr("ACGT","TGCA",rev)
}
Section 2.3.2: Counting RNA Sequences
sub count {
  my $n = shift;
  my @R;
  $R[0] = $R[1] = $R[2] = 1;
  for (my $i = 2; $i < $n; $i++) {
    $R[$i+1] = $R[$i];
    for (my $j = 1; $j < $i; $j++) {
      $R[$i+1] += $R[$j-1] * $R[$i-$j]
    }
  }
  return $R[$n];
}
count <- function (n) {
  R <- rep(0,n)
  R[1:3] <- 1
  if (n > 2) {
    for (i in 2:(n-1)) {
      R[i+2] <- R[i+1]
      for (j in 1:(i-1)) {
        R[i+2] <- R[i+2] + R[j] * R[i-j+1]
      }
    }
  }
  R[n+1]
}
Section 2.3.3: Generating DNA Sequences
sub words {
  my $n = shift;
  my $alphabet = shift;
  my @n = split "", $alphabet;
  if ($n == 1) {
    return \@n;
  } else {
    my $short = words($n - 1, $alphabet);
    my @long;
    for my $seq (@$short) {
      for my $n (@n) {
        push @long, $seq . $n;
      }
    }
    return \@long;
  }
}
library(seqinr)

words(length=3,alphabet=s2c("ACGT"))
Section 2.3.4: Representing Sequences in Perl
Section 2.3.5: Representing Sequences in R

Chapter 3: Simple Pattern Matching in Sequences

Section 3.1: Finding Words in Sequences

Section 3.1.1: Word Composition of Sequences
sub word_composition {
  my $seq = shift;
  my $k = shift;
  my $alphabet = shift;

  my $words = words($k,$alphabet);

  my %freq;
  for my $word (@$words) {
    $freq{$word} = 0;
  }

  for my $i (0 .. length($seq)-$k) {
    my $word = substr($seq,$i,$k);
    $freq{$word}++;
  }

  return \%freq;
}
library(seqinr)

count(s2c(seq),word=3,alphabet=s2c("ACGT"))
Section 3.1.2: Alignment Free Comparison of Sequences

library(seqinr)

alignment.free.distance <- function (seq1,seq2,k,sigma) {
  freq1 <- count(seq1,word=k,alphabet=s2c(sigma))
  freq2 <- count(seq2,word=k,alphabet=s2c(sigma))
  cor(freq1,freq2)
}
sub mean {
  my $a = shift;
  my $res;
  foreach (@$a) { $res += $_ }
  $res /= @$a;
}

sub sd {
  my $a = shift;
  my $mean = mean($a);
  return sqrt( mean( [map $_ ** 2, @$a] ) -
    ($mean ** 2) );
}

sub cov {
  my $a1 = shift;
  my $a2 = shift;
  my $res;
  foreach (0 .. @$a1 - 1) {
    $res += $a1->[$_] * $a2->[$_];
  }
  $res /= @$a1;
  $res -= mean($a1) * mean($a2);
}

sub cor {
  my $a1 = shift;
  my $a2 = shift;
  return cov($a1,$a2) / (sd($a1) * sd($a2));
}

Chapter 4: General Pattern Matching in Sequences

Section 4.1: Finding Subsequences

sub occurrences {
  my $p = shift;
  my $t = shift;
  my $m = length $p;
  my $n = length $t;
  my @L;
  for (my $i = 0; $i < $n-$m+1; $i++) {
    if ($p eq substr($t,$i,$m)) {
      push @L, $i;
    }
  }
  return \@L;
}
occurrences <- function (p,t) {
  L <- c()
  for (i in 1:nchar(t)-nchar(p)+1)
    if (p == substr(t,i,i+nchar(p)-1))
      L <- c(L,i)
  L
}

Section 4.1.1: Suffix Arrays

sub suffix_array {
  my $s = shift;
  my @sa = (0 .. length($s)-1);
  @sa = sort {substr($s,$a) cmp substr($s,$b)} @sa;
  return \@sa;
}
suffix.array <- function (seq) {
  sa <- apply(matrix(1:nchar(seq)),1,function(i)substr(seq,i,nchar(seq)))
  order(sa)
}
sub occurrences {
  my $p = shift;
  my $t = shift;
  my $sa = shift;
  my ($l, $r) = (0, $#sa);
  while ($l <= $r) {
    my $i = int(($l + $r)/2);
    my $c = $p cmp substr($t, $sa[$i], length $p);
    if ($c < 0) {
      $r = $i - 1;
    } elsif ($c > 0) {
      $l = $i + 1;
    } else {
      my $j = $i;
      while ($i > 0 &&
          $p eq substr($t, $sa[$i-1], length $p)) {
        $i--;
      }
      while ($j < $#sa-1 &&
          $p eq substr($t, $sa[$j+1], length $p)) {
        $j++;
      }
      return ($i, $j);
    }
  }
  return (-1, -1);
}
occurrences <- function (p,t,sa) {
  ell <- 1
  r <- nchar(t)
  while (ell <= r) {
    i <- as.integer((ell + r) / 2)
    s <- substr(t,sa[i],sa[i]+nchar(p)-1)
    if (p < s) {
      r <- i - 1
    } else if (p > s) {
      ell <- i + 1
    } else {
      j <- i
      while (i > 1 &&
          p==substr(t,sa[i-1],sa[i-1]+nchar(p)-1)) {
        i <- i - 1
      }
      while (j < nchar(t) &&
          p==substr(t,sa[j+1],sa[j+1]+nchar(p)-1)) {
        j <- j + 1
      }
      return(c(i,j))
    }
  }
  return(c(-1,-1))
}
sub lcp {
  my ($p, $i, $t, $j) = @_;
  my $c = 0;
  while ($i+$c < length $p && $j+$c < length $t &&
      substr($p,$i+$c,1) eq substr($t,$j+$c,1)) {
    $c++;
  }
  return $c;
}
lcp <- function (seq1,pos1,seq2,pos2) {
  n1 <- nchar(seq1)
  n2 <- nchar(seq2)
  c <- 0
  while (pos1+c <= n1 && pos2+c <= n2 &&
      substr(seq1,pos1+c,pos1+c) ==
      substr(seq2,pos2+c,pos2+c)) {
    c <- c + 1
  }
  c
}
sub occurrences {
  my $p = shift;
  my $t = shift;
  my $sa = shift;
  my ($l, $r) = (0, $#sa);
  my ($llcp, $rlcp) = (0, 0);
  while ($l <= $r) {
    my $i = int(($l + $r)/2);
    my $h = $llcp; $h = $rlcp if $rlcp < $h;
    my $c = lcp($p,$h,$t,$sa[$i]+$h);
    if ($h+$c == length $p) {
      my $j = $i;
      while ($i > 0 &&
          $p eq substr($t, $sa[$i-1], length $p)) {
        $i--;
      }
      while ($j < $#sa-1 &&
          $p eq substr($t, $sa[$j+1], length $p)) {
        $j++;
      }
      return ($i, $j);
    } elsif (substr($p,$h+$c,1) lt
        substr($t,$sa[$i]+$h+$c,1)) {
      $r = $i - 1;
      $rlcp = $h + $c;
    } else {
      $l = $i + 1;
      $llcp = $h + $c;
    }
  }
  return (-1, -1);
}
occurrences <- function (p,t,sa) {
  ell <- 1
  r <- nchar(t)
  llcp <- 0
  rlcp <- 0
  while (ell <= r) {
    i <- as.integer((ell + r) / 2)
    h <- min(llcp,rlcp)
    c <- lcp(p,h+1,t,sa[i]+h)
    if (h+c == nchar(p)) {
      j <- i
      while (i > 1 &&
          p==substr(t,sa[i-1],sa[i-1]+nchar(p)-1)) {
        i <- i - 1
      }
      while (j < nchar(t) &&
          p==substr(t,sa[j+1],sa[j+1]+nchar(p)-1)) {
        j <- j + 1
      }
      return(c(i,j))
    } else if (substr(p,h+c+1,h+c+1) <
        substr(t,sa[i]+h+c,sa[i]+h+c)) {
      r <- i - 1
      rlcp <- h + c
    } else {
      ell <- i + 1
      llcp <- h + c
    }
  }
  return(c(-1,-1))
}

Section 4.2: Finding Common Subsequences

sub common_occurrences {
  my $p = shift;
  my $t1 = shift;
  my $t2 = shift;
  my $m = length $p;
  my $n1 = length $t1;
  my $n2 = length $t2;
  my @L;
  for (my $i = 0; $i < $n1-$m+1; $i++) {
    if ($p eq substr($t1,$i,$m)) {
      for (my $j = 0; $j < $n2-$m+1; $j++) {
        if ($p eq substr($t2,$j,$m)) {
          push @L, $i, $j;
        }
      }
    }
  }
  return \@L;
}
common.occurrences <- function (p,t1,t2) {
  L <- c()
  for (i in 1:nchar(t1)-nchar(p)+1)
    if (p == substr(t1,i,i+nchar(p)-1))
      for (j in 1:nchar(t2)-nchar(p)+1)
        if (p == substr(t2,j,j+nchar(p)-1))
          L <- c(L,i,j)
  L
}
sub longest_common_subsequences {
  my $s1 = shift;
  my $s2 = shift;
  my $n1 = length($s1);
  my $n2 = length($s2);
  my @L;
  my $l = 0;
  for (my $i = 0; $i < $n1; $i++) {
    for (my $j = $i; $j < $n1; $j++) {
      my $seq = substr($s1,$i,$j-$i+1);
      for (my $k = 0; $k < $n2-$j+$i; $k++) {
        if ($seq eq substr($s2,$k,$j-$i+1)) {
          if (length($seq) == $l) {
            push @L, $seq;
          } elsif (length($seq) > $l) {
            @L = $seq;
            $l = length($seq);
          }
        }
      }
    }
  }
  my %count;
  for my $seq (@L) { $count{$seq}++; }
  @L = sort keys %count;
  return \@L;
}
longest.common.subsequences <- function (s1,s2) {
  L <- c()
  ell <- 0
  for (i in 1:nchar(s1)) {
    for (j in i:nchar(s1)) {
      seq <- substr(s1,i,j)
      for (k in 1:(nchar(s2)-j+i)) {
        if (seq == substr(s2,k,k+j-i)) {
          if (nchar(seq) == ell) {
            L <- c(L,seq)
          } else if (nchar(seq) > ell) {
            L <- c(seq)
            ell <- nchar(seq)
          }
        }
      }
    }
  }
  unique(L)
}
sub longest_common_subsequences {
  my $s1 = shift;
  my $s2 = shift;
  my @L;
  my $l = 0;
  my @LCS;
  for (my $i = 0; $i < length $s1; $i++ ) {
    for (my $j = 0; $j < length $s2; $j++ ) {
      $LCS[$i][$j] = 0;
      if (substr($s1,$i,1) eq substr($s2,$j,1)) {
        if ($i == 0 || $j == 0) {
          $LCS[$i][$j] = 1;
        } else {
          $LCS[$i][$j] = $LCS[$i-1][$j-1] + 1;
        }
        if ($LCS[$i][$j] > $l) {
          $l = $LCS[$i][$j];
          @L = ();
        }
        if ($LCS[$i][$j] == $l) {
          push @L, substr($s1,$i-$l+1,$l);
        }
      }
    }
  }
  return @L;
}
longest.common.subsequences <- function (s1,s2) {
  L <- c()
  ell <- 0
  LCS <- matrix(0,nrow=nchar(s1),ncol=nchar(s2))
  for (i in 1:nchar(s1)) {
    for (j in 1:nchar(s2)) {
      if (substr(s1,i,i) == substr(s2,j,j)) {
        if (i == 1 || j == 1) {
          LCS[i,j] <- 1
        } else {
          LCS[i,j] <- LCS[i-1,j-1] + 1
        }
        if (LCS[i,j] > ell) {
          ell <- LCS[i,j]
          L <- c()
        }
        if (LCS[i,j] == ell) {
          L <- c(L,substr(s1,i-ell+1,i))
        }
      }
    }
  }
  L
}

Section 4.2.1: Generalized Suffix Arrays

sub generalized_suffix_array {
  my $seq = shift;
  my %seq = %{$seq};

  my @gsa;
  for my $id (keys %seq) {
    for my $i (0 .. length($seq{$id})-1) {
      push @gsa, [$i, $id];
    }
  }

  @gsa = sort {substr($seq{@{$a}[1]}, @{$a}[0])
    cmp substr($seq{@{$b}[1]}, @{$b}[0])} @gsa;

  return \@gsa;
}
generalized.suffix.array <- function (s1,s2) {
  sa1 <- apply(matrix(1:nchar(s1)),1,
    function (i) substr(s1,i,nchar(s1)))
  sa2 <- apply(matrix(1:nchar(s2)),1,
    function (i) substr(s2,i,nchar(s2)))
  arr <- order(c(sa1,sa2))
  tag <- sapply(arr,function (i)
    if (i <= nchar(s1)) { 1 } else { 2 } )
  arr <- sapply(arr,function (i)
    if (i <= nchar(s1)) { i } else { i-nchar(s1) } )
  cbind(arr,tag)
}
sub common_occurrences {
  my $p = shift;
  my $seq = shift;
  my %seq = %{$seq};
  my $gsa = shift;
  my @gsa = @{$gsa};
  my ($l, $r) = (0, $#gsa);
  while ($l <= $r) {
    my $i = int(($l + $r)/2);
    my $c = $p cmp substr($seq{$gsa[$i][1]}, $gsa[$i][0], length $p);
    if ($c < 0) {
      $r = $i - 1;
    } elsif ($c > 0) {
      $l = $i + 1;
    } else {
      my $j = $i;
      while ($i > 0 &&
          $p eq substr($seq{$gsa[$i-1][1]}, $gsa[$i-1][0], length $p)) {
        $i--;
      }
      while ($j < $#gsa &&
          $p eq substr($seq{$gsa[$j+1][1]}, $gsa[$j+1][0], length $p)) {
        $j++;
      }
      return ($i, $j);
    }
  }
  return (-1, -1);
}
common.occurrences <- function (p,seq,gsa) {
  ell <- 1
  r <- n <- nchar(seq[1]) + nchar(seq[2])
  while (ell <= r) {
    i <- as.integer((ell + r) / 2)
    s <- substr(seq[gsa[i,2]],
      gsa[i,1],gsa[i,1]+nchar(p)-1)
    if (p < s) {
      r <- i - 1
    } else if (p > s) {
      ell <- i + 1
    } else {
      j <- i
      while (i > 1 && p == s) {
        s <- substr(seq[gsa[i-1,2]],
          gsa[i-1,1],gsa[i-1,1]+nchar(p)-1)
        if (p == s) i <- i - 1
      }
      s <- p
      while (j < n && p == s) {
        s <- substr(seq[gsa[j+1,2]],
          gsa[j+1,1],gsa[j+1,1]+nchar(p)-1)
        if (p == s) j <- j + 1
      }
      return(c(i,j))
    }
  }
  return(c(-1,-1))
}
sub common_occurrences {
  my $p = shift;
  my $seq = shift;
  my %seq = %{$seq};
  my $gsa = shift;
  my @gsa = @{$gsa};
  my ($l, $r) = (0, $#gsa);
  my ($llcp, $rlcp) = (0, 0);
  while ($l <= $r) {
    my $i = int(($l + $r)/2);
    my $h = $llcp; $h = $rlcp if $rlcp < $h;
    my $c = lcp($p,$h,$seq{$gsa[$i][1]},$gsa[$i][0]+$h);
    if ($h+$c == length $p) {
      my $j = $i;
      while ($i > 0 &&
          $p eq substr($seq{$gsa[$i-1][1]}, $gsa[$i-1][0], length $p)) {
        $i--;
      }
      while ($j < $#gsa-1 &&
          $p eq substr($seq{$gsa[$j+1][1]}, $gsa[$j+1][0], length $p)) {
        $j++;
      }
      return ($i, $j);
    } elsif (substr($p,$h+$c,1) lt
        substr($seq{$gsa[$i][1]},$gsa[$i][0]+$h+$c,1)) {
      $r = $i - 1;
      $rlcp = $h + $c;
    } else {
      $l = $i + 1;
      $llcp = $h + $c;
    }
  }
  return (-1, -1);
}
common.occurrences <- function (p,seq,gsa) {
  ell <- 1
  r <- n <- nchar(seq[1]) + nchar(seq[2])
  llcp <- 0
  rlcp <- 0
  while (ell <= r) {
    i <- as.integer((ell + r) / 2)
    s <- substr(seq[gsa[i,2]],gsa[i,1],
      gsa[i,1]+nchar(p)-1)
    h <- min(llcp,rlcp)
    c <- lcp(p,h+1,s,1+h)
    if (h+c == nchar(p)) {
      j <- i
      while (i > 1 && p == s) {
        s <- substr(seq[gsa[i-1,2]],gsa[i-1,1],
          gsa[i-1,1]+nchar(p)-1)
        if (p == s) i <- i - 1
      }
      s <- p
      while (j < n && p == s) {
        s <- substr(seq[gsa[j+1,2]],gsa[j+1,1],
          gsa[j+1,1]+nchar(p)-1)
        if (p == s) j <- j + 1
      }
      return(c(i,j))
    } else {
      s <- substr(seq[gsa[i,2]],gsa[i,1]+h+c,
        gsa[i,1]+h+c)
      if (substr(p,h+c+1,h+c+1) < s) {
        r <- i - 1
        rlcp <- h + c
      } else {
        ell <- i + 1
        llcp <- h + c
      }
    }
  }
  return(c(-1,-1))
}
sub longest_common_subsequences {
  my $seq = shift;
  my %seq = %{$seq};
  my $gsa = shift;
  my @gsa = @{$gsa};
  my @L;
  my $l = 0;
  my $f2 = substr($seq{$gsa[0][1]}, $gsa[0][0], length($seq{$gsa[0][1]}));
  for (my $i = 1; $i < length($seq{1})+length($seq{2}); $i++) {
    my $f1 = $f2;
    $f2 = substr($seq{$gsa[$i][1]}, $gsa[$i][0], length($seq{$gsa[$i][1]}));
    if ($gsa[$i][1] != $gsa[$i-1][1]) {
      my $p = lcp($f1,0,$f2,0);
      if ($p > $l) {
        $l = $p;
        @L = ();
      }
      if ($p == $l) {
        push @L,
          substr($seq{$gsa[$i][1]},$gsa[$i][0],$p);
      }
    }
  }
  return \@L;
}
longest.common.subsequences <- function (seq,gsa) {
  L <- c()
  ell <- 0
  suf2 <- substr(seq[gsa[1,2]],gsa[1,1],
    nchar(seq[gsa[1,2]]))
  for (i in 2:(nrow(gsa))) {
    suf1 <- suf2
    suf2 <- substr(seq[gsa[i,2]],gsa[i,1],
      nchar(seq[gsa[i,2]]))
    if (gsa[i-1,2] != gsa[i,2]) {
      pref <- substr(suf1,1,lcp(suf1,1,suf2,1))
      if (nchar(pref) > ell) {
        ell <- nchar(pref)
        L <- c()
      }
      if (nchar(pref) == ell) {
        L <- c(L,pref)
      }
    }
  }
  L
}

Section 4.3: Comparing Sequences

Section 4.3.1: Edit Distance Based Comparison of Sequences

sub hamming_distance {
  my $s1 = shift;
  my $s2 = shift;
  my $d = -1;
  if (length $s1 == length $s2) {
    $d = 0;
    for (my $i = 0; $i < length $s1; $i++) {
      $d++ if substr($s1,$i,1) ne substr($s2,$i,1);
    }
  }
  return $d;
}
hamming.distance <- function (s1,s2) {
  if (nchar(s1) != nchar(s2))
    -1
  else
    sum(strsplit(s1,"")[[1]] != strsplit(s2,"")[[1]])
}
sub levenshtein_distance {
  my $s1 = shift;
  my $s2 = shift;
  my ($n1, $n2) = (length $s1, length $s2);

  my @d;
  for ( my $i = 0; $i <= $n1; $i++ ) {
    $d[$i][0] = $i;
  }
  for ( my $j = 0; $j <= $n2; $j++ ) {
    $d[0][$j] = $j;
  }

  my @t1 = split //, $s1;
  my @t2 = split //, $s2;

  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      if ($t1[$i-1] eq $t2[$j-1]) {
        $d[$i][$j] = $d[$i-1][$j-1];
      } else {
        $d[$i][$j] = $d[$i-1][$j] + 1;
        if ($d[$i][$j-1] + 1 < $d[$i][$j]) {
          $d[$i][$j] = $d[$i][$j-1] + 1;
        }
      }
    }
  }

  return \@d;
}
levenshtein.distance <- function(s1,s2) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  n1 <- length(t1)
  n2 <- length(t2)
  d <- array(0,dim=c(n1+1,n2+1))
  d[,1] <- 0:n1
  d[1,] <- 0:n2
  for (i in 2:(n1+1))
    for (j in 2:(n2+1))
      if (t1[i-1] == t2[j-1])
        d[i,j] <- d[i-1,j-1]
      else
        d[i,j] <- min(d[i-1,j]+1, d[i,j-1]+1)
  d
}
sub edit_distance {
  my $s1 = shift;
  my $s2 = shift;
  my ($n1, $n2) = (length $s1, length $s2);
  my @d;
  for ( my $i = 0; $i <= $n1; $i++ ) {
    $d[$i][0] = $i;
  }
  for ( my $j = 0; $j <= $n2; $j++ ) {
    $d[0][$j] = $j;
  }

  my @t1 = split //, $s1;
  my @t2 = split //, $s2;

  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      $d[$i][$j] = $d[$i-1][$j] + 1;
      if ($d[$i][$j-1] + 1 < $d[$i][$j]) {
        $d[$i][$j] = $d[$i][$j-1] + 1;
      }
      if ($t1[$i-1] eq $t2[$j-1]) {
        if ($d[$i-1][$j-1] < $d[$i][$j]) {
          $d[$i][$j] = $d[$i-1][$j-1];
        }
      } else {
        if ($d[$i-1][$j-1] + 1 < $d[$i][$j]) {
          $d[$i][$j] = $d[$i-1][$j-1] + 1;
        }
      }
    }
  }

  return \@d;
}
edit.distance <- function (s1,s2) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  n1 <- length(t1)
  n2 <- length(t2)
  d <- array(0,dim=c(n1+1,n2+1))
  d[,1] <- 0:n1
  d[1,] <- 0:n2
  for (i in 2:(n1+1)) {
    for (j in 2:(n2+1)) {
      d[i,j] <- min(d[i-1,j]+1, d[i,j-1]+1)
      if (t1[i-1] == t2[j-1])
        d[i,j] <- min(d[i,j], d[i-1,j-1])
      else
        d[i,j] <- min(d[i,j], d[i-1,j-1]+1)
    }
  }
  d
}

Section 4.3.2: Alignment Based Comparison of Sequences

sub alignment {
  my $s1 = shift;
  my $s2 = shift;
  my $d = shift;
  my @d = @{$d};
  my ($t1, $t2) = ("", "");
  my @t1 = split //, $s1;
  my @t2 = split //, $s2;
  my ($n1, $n2) = (length $s1, length $s2);
  my ($i, $j) = ($n1, $n2);
  while ($i > 0 && $j > 0) {
    if ($d[$i][$j] eq $d[$i][$j-1]+1) {
      $t1 = "-" . $t1;
      $t2 = $t2[$j-1] . $t2;
      $j--;
    } elsif ($d[$i][$j] eq $d[$i-1][$j]+1) {
      $t1 = $t1[$i-1] . $t1;
      $t2 = "-" . $t2;
      $i--;
    } else {
      $t1 = $t1[$i-1] . $t1;
      $t2 = $t2[$j-1] . $t2;
      $i--;
      $j--;
    }
  }
  while ($j > 0) {
    $t1 = "-" . $t1;
    $t2 = $t2[$j-1] . $t2;
    $j--;
  }
  while ($i > 0) {
    $t1 = $t1[$i-1] . $t1;
    $t2 = "-" . $t2;
    $i--;
  }
  print "$t1\n$t2\n";
}
alignment <- function (s1,s2,d) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  a1 <- a2 <- c()
  i <- length(t1)+1
  j <- length(t2)+1
  while (i > 1 && j > 1) {
    if (d[i,j] == d[i,j-1]+1) {
      a1 <- c("-",a1)
      a2 <- c(t2[j-1],a2)
      j <- j-1
    } else if (d[i,j] == d[i-1,j]+1) {
      a1 <- c(t1[i-1],a1)
      a2 <- c("-",a2)
      i <- i-1
    } else {
      a1 <- c(t1[i-1],a1)
      a2 <- c(t2[j-1],a2)
      i <- i-1
      j <- j-1
    }
  }
  while (j > 1) {
    a1 <- c("-",a1)
    a2 <- c(t2[j-1],a2)
    j <- j-1
  }
  while (i > 1) {
    a1 <- c(t1[i-1],a1)
    a2 <- c("-",a2)
    i <- i-1
  }
  print(paste(a1,collapse=""))
  print(paste(a2,collapse=""))
}
sub local_alignment {
  my ($s1, $s2, $match, $mismatch, $gap) = @_;
  my ($n1, $n2) = ( length $s1, length $s2);

  my @s;
  for (my $i = 0; $i <= $n1; $i++) { $s[$i][0] = 0; }
  for (my $j = 0; $j <= $n2; $j++) { $s[0][$j] = 0; }

  my @t1 = split //, $s1;
  my @t2 = split //, $s2;

  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      if ($t1[$i-1] eq $t2[$j-1]) {
        $s[$i][$j] = $s[$i-1][$j-1] + $match;
      } else {
        $s[$i][$j] = $s[$i-1][$j-1] + $mismatch;
      }
      $s[$i][$j] = $s[$i-1][$j] + $gap
        if $s[$i-1][$j] + $gap > $s[$i][$j];
      $s[$i][$j] = $s[$i][$j-1] + $gap
        if $s[$i][$j-1] + $gap > $s[$i][$j];
      $s[$i][$j] = 0 if 0 > $s[$i][$j];
    }
  }

  my $max = 0;
  my @opt;
  for ( my $i = 1; $i <= $n1; $i++ ) {
    for ( my $j = 1; $j <= $n2; $j++ ) {
      if ($s[$i][$j] > $max) {
        $max = $s[$i][$j];
        @opt = ();
      }
      if ($s[$i][$j] == $max) {
        $max = $s[$i][$j];
        push @opt, $i, $j;
      }
    }
  }

  while (@opt) {
    my $i = shift @opt;
    my $j = shift @opt;
    my ($t1, $t2) = ("", "");
    while ($s[$i][$j] != 0) {
      if (($t1[$i-1] eq $t2[$j-1] && $s[$i][$j] eq
          $s[$i-1][$j-1] + $match) || ($t1[$i-1] ne $t2[$j-1] &&
          $s[$i][$j] eq $s[$i-1][$j-1] + $mismatch)) {
        $t1 = $t1[$i-1] . $t1;
        $t2 = $t2[$j-1] . $t2;
        $i--;
        $j--;
      } elsif ($s[$i][$j] eq $s[$i-1][$j] + $gap) {
        $t1 = $t1[$i-1] . $t1;
        $t2 = "-" . $t2;
        $i--;
      } elsif ($s[$i][$j] eq $s[$i][$j-1] + $gap) {
        $t1 = "-" . $t1;
        $t2 = $t2[$j-1] . $t2;
        $j--;
      }
    }
    print "$t1\n$t2\n";
  }
}
local.alignment <- function
    (s1,s2,match,mismatch,gap) {
  t1 <- strsplit(s1,split="")[[1]]
  t2 <- strsplit(s2,split="")[[1]]
  n1 <- length(t1)
  n2 <- length(t2)
  s <- array(0,dim=c(n1+1,n2+1))
  s[,1] <- 0
  s[1,] <- 0
  for (i in 2:(n1+1)) {
    for (j in 2:(n2+1)) {
      if (t1[i-1] == t2[j-1])
        s[i,j] <- s[i-1,j-1] + match
      else
        s[i,j] <- s[i-1,j-1] + mismatch
      s[i,j] <- max(
        s[i,j],
        s[i-1,j] + gap,
        s[i,j-1] + gap,
        0)
    }
  }
  opt <- which(s==max(s),arr.ind=TRUE)
  for (sol in 1:nrow(opt)) {
    i <- opt[sol,1]
    j <- opt[sol,2]
    m1 <- c()
    m2 <- c()
    while (s[i,j] != 0) {
      if ((t1[i-1] == t2[j-1] && s[i,j] ==
            s[i-1,j-1] + match) ||
          (t1[i-1] != t2[j-1] && s[i,j] ==
            s[i-1,j-1] + mismatch)) {
        m1 <- c(t1[i-1],m1)
        m2 <- c(t2[j-1],m2)
        i <- i - 1
        j <- j - 1
      } else if (s[i,j] == s[i-1,j] + gap) {
        m1 <- c(t1[i-1],m1)
        m2 <- c("-",m2)
        i <- i - 1
      } else if (s[i,j] == s[i,j-1] + gap) {
        m1 <- c("-",m1)
        m2 <- c(t2[j-1],m2)
        j <- j - 1
      }
    }
    print(paste(m1,collapse=""))
    print(paste(m2,collapse=""))
  }
}

Chapter 5: Trees

Section 5.1: Trees in Mathematics

Section 5.1.1: Counting Labeled Trees

Section 5.2: Trees in Computer Science

Section 5.2.1: Traversing Rooted Trees

Section 5.3: Trees in Computational Biology

Section 5.3.1: The Newick Linear Representation
Section 5.3.2: Counting Phylogenetic Trees
Section 5.3.3: Generating Phylogenetic Trees
Section 5.3.4: Representing Trees in Perl
Section 5.3.5: Representing Trees in R

Chapter 6: Simple Pattern Matching in Trees

Section 6.1: Finding Paths in Unrooted Trees

Section 6.1.1: Distances in Unrooted Trees
Section 6.1.2: The Partition Distance between Unrooted Trees
sub partition {
  my $tree = shift;
  my %partition = ();
  for my $node ($tree->get_nodes) {
    next if $node->is_Leaf; # discard terminal nodes
    next unless $node->ancestor; # discard root
    my @a = sort { $a cmp $b } map { $_->id }
      grep {$_->is_Leaf } $node->get_all_Descendents;
    my $aa = join ",", @a;
    my @b = ();
    my %count = ();
    foreach my $label (@a, map { $_->id } $tree->get_leaf_nodes) {
      $count{$label}++
    }
    foreach my $label (keys %count) {
      push @b, $label if $count{$label} == 1;
    }
    my $bb = join ",", sort { $a cmp $b } @b;
    my $p = $aa le $bb ? "$aa:$bb" : "$bb:$aa";
    $partition{$p} = 1;
  }
  my @partition = sort keys %partition;
  return \@partition;
}

sub partition_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @partition1 = @{ partition($tree1) };
  my @partition2 = @{ partition($tree2) };
  my $dist = 0;
  my %count = ();
  foreach my $p (@partition1, @partition2) {
    $count{$p}++
  }
  foreach my $p (keys %count) {
    $dist++ if $count{$p} == 1;
  }
  return $dist;
}
library(ape)

dist.topo(t1,t2)
Section 6.1.3: The Nodal Distance between Unrooted Trees
sub distances {
  my $tree = shift;
  my @leaves = sort {$a->id cmp $b->id} $tree->get_leaf_nodes;
  my @labels = map { $_->id } @leaves;
  my $n = scalar @labels;
  my @dist;
  for my $i (1..$n-1) {
    my @nodes = $tree->find_node(-id => $labels[$i-1]);
    for my $j ($i+1..$n) {
      push @nodes, $tree->find_node(-id => $labels[$j-1]);
      push @dist, $tree->distance(-nodes => \@nodes);
      pop @nodes;
    }
  }
  return \@dist;
}

sub nodal_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @dist1 = @{ distances($tree1) };
  my @dist2 = @{ distances($tree2) };
  my ($dist, $d1, $d2);
  while (@dist1 and @dist2) {
    $d1 = pop @dist1;
    $d2 = pop @dist2;
    $dist += abs($d1-$d2);
  }
  return $dist;
}
library(ape)

nodal.distance <- function (t1,t2) {
    m1<-cophenetic.phylo(t1)
    m1<-m1[order(rownames(m1)),order(colnames(m1))]
    n1<-m1[upper.tri(m1)]
    m2<-cophenetic.phylo(t2)
    m2<-m2[order(rownames(m2)),order(colnames(m2))]
    n2<-m2[upper.tri(m2)]
    sum(abs(n1-n2))
}

Section 6.2: Finding Paths in Rooted Trees

Section 6.2.1: Distances in Rooted Trees
Section 6.2.2: The Partition Distance between Rooted Trees
Section 6.2.3: The Nodal Distance between Rooted Trees

Chapter 7: General Pattern Matching in Trees

Section 7.1: Finding Subtrees

Section 7.1.1: Finding Subtrees Induced by Triplets
sub triplet {
  my $tree = shift;
  my $i = shift;
  my $j = shift;
  my $k = shift;

  my @ij = grep { $_->id =~ /$i|$j/ } $tree->get_leaf_nodes;
  my @ik = grep { $_->id =~ /$i|$k/ } $tree->get_leaf_nodes;
  my @jk = grep { $_->id =~ /$j|$k/ } $tree->get_leaf_nodes;

  my $ij = $tree->get_lca(-nodes => \@ij);
  my $ik = $tree->get_lca(-nodes => \@ik);
  my $jk = $tree->get_lca(-nodes => \@jk);

  my $str;
  if ($ik == $jk) {
    $str = "(($i,$j),$k);";
  } else {
    if ($ij == $jk) {
      $str = "(($i,$k),$j);";
    } else {
      $str = "(($j,$k),$i);";
    }
  }

  return $str;
}
library(ape)

triplet <- function (t,i,j,k) {
    ij <- mrca(t)[i,j]
    ik <- mrca(t)[i,k]
    jk <- mrca(t)[j,k]
    if (ik == jk)
      paste("((",i,",",j,"),",k,");",sep="")
    else
      if (ij == jk)
        paste("((",i,",",k,"),",j,");",sep="")
      else
        paste("((",j,",",k,"),",i,");",sep="")
}
Section 7.1.2: Finding Subtrees Induced by Quartets
use Clone qw(clone);

sub quartet {
  my ($tree,$i,$j,$k,$l) = @_;
  my $copy = clone $tree;

  map { $copy->remove_Node($_) }
    grep { !($_->id =~ /$i|$j|$k|$l/) }
    $copy->get_leaf_nodes;

  return $copy;
}
library(ape)

quartet <- function (t,i,j,k,l) {
  unroot(drop.tip(t,setdiff(t$tip.label,c(i,j,k,l))))
}

Section 7.2: Finding Common Subtrees

Section 7.2.1: Maximum Agreement of Rooted Trees

parent <- function (tree,node)
  tree$edge[which(tree$edge[,2]==node),1]

children <- function (tree,node)
  tree$edge[which(tree$edge[,1]==node),2]

is.root <- function (tree,node)
  length(tree$tip.label)+1==node

is.ancestor <- function(tree,anc,des) {
  if (anc==des) return(TRUE)
  if (is.root(tree,des)) return(FALSE)
  else return(is.ancestor(tree,anc,parent(tree,des)))
}

postorder <- function (tree) {
  r <- length(tree$tip.label)+1
  postorder.traversal(tree,r,c())
}

postorder.traversal <- function (tree,v,res) {
  for (w in t1$edge[which(tree$edge[,1]==v),2])
    res <- postorder.traversal(tree,w,res)
  c(res,v)
}
sub mast_size {
  my $tree1 = shift;
  my $tree2 = shift;
  my (@m, $m);
  for my $node1 (@{ $tree1->Bio::Tree::Compatible::postorder_traversal }) {
    my $n1 = $node1->internal_id;
    for my $node2 (@{ $tree2->Bio::Tree::Compatible::postorder_traversal }) {
      my $n2 = $node2->internal_id;
      if ($node1->is_Leaf) {
        if ($node2->is_Leaf) {
          $m = ($node1->id eq $node2->id ? 1 : 0);
        } else {
          my $label1 = $node1->id;
          $m = (grep { /$label1/ } map { $_->id }
            grep { $_->is_Leaf } $node2->get_all_Descendents) ? 1 : 0;
        }
      } else {
        if ($node2->is_Leaf) {
          my $label2 = $node2->id;
          $m = (grep { /$label2/ } map { $_->id }
            grep { $_->is_Leaf } $node1->get_all_Descendents) ? 1 : 0;
        } else {
          my ($l1,$r1) = map { $_->internal_id } $node1->each_Descendent;
          my ($l2,$r2) = map { $_->internal_id } $node2->each_Descendent;
          $m = $m[$l1][$l2] + $m[$r1][$r2];
          if ($m[$l1][$r2] + $m[$r1][$l2] > $m) {
            $m = $m[$l1][$r2] + $m[$r1][$l2];
          }
          if ($m[$n1][$l2] > $m) {
            $m = $m[$n1][$l2];
          }
          if ($m[$n1][$r2] > $m) {
            $m = $m[$n1][$r2];
          }
          if ($m[$l1][$n2] > $m) {
            $m = $m[$l1][$n2];
          }
          if ($m[$r1][$n2] > $m) {
            $m = $m[$r1][$n2];
          }
        }
      }
      $m[$n1][$n2] = $m;
    }
  }
  return $m;
}
mast.size <- function (t1,t2) {
  po1 <- postorder(t1)
  po2 <- postorder(t2)
  m <- matrix(0,nrow=length(po1),ncol=length(po2),dimnames=list(po1,po2))
  for (i in po1) {
    for (j in po2) {
      if (length(t1$edge[which(t1$edge[,1]==i),1])==0) {
        if (length(t2$edge[which(t2$edge[,1]==j),1])==0) {
          if (t1$tip.label[i]==t2$tip.label[j]) m[i,j] <- 1
        } else {
          jj <- which(t2$tip.label==t1$tip.label[i])
          if (is.ancestor(t2,j,jj)) m[i,j] <- 1
        }
      } else {
        if (length(t2$edge[which(t2$edge[,1]==j),1])==0) {
          ii <- which(t1$tip.label==t2$tip.label[j])
          if (is.ancestor(t1,i,ii)) m[i,j] <- 1
        } else {
          l1 <- children(t1,i)[1]
          r1 <- children(t1,i)[2]
          l2 <- children(t2,j)[1]
          r2 <- children(t2,j)[2]
          m[i,j] <- max(m[l1,l2]+m[r1,r2],m[l1,r2]+m[r1,l2],
            m[i,l2],m[i,r2],m[l1,j],m[r1,j])
        }
      }
    }
  }
  m <- m[po1,po2]
  dimnames(m) <- list(po1,po2)
  m
}
sub mast {
  my $tree1 = shift;
  my $tree2 = shift;
  my @m;
  for my $node1 (@{ $tree1->Bio::Tree::Compatible::postorder_traversal }) {
    my $n1 = $node1->internal_id;
    for my $node2 (@{ $tree2->Bio::Tree::Compatible::postorder_traversal }) {
      my $n2 = $node2->internal_id;
      my @mm = ();
      if ($node1->is_Leaf) {
        if ($node2->is_Leaf) {
          if ($node1->id eq $node2->id) {
            @mm = ($node1->id);
          }
        } else {
          my $label1 = $node1->id;
          if (grep { /$label1/ } map { $_->id }
            grep { $_->is_Leaf } $node2->get_all_Descendents) {
            @mm = ($label1);
          }
        }
      } else {
        if ($node2->is_Leaf) {
          my $label2 = $node2->id;
          if (grep { /$label2/ } map { $_->id }
            grep { $_->is_Leaf } $node1->get_all_Descendents) {
            @mm = ($label2);
          }
        } else {
          my ($l1,$r1) = map { $_->internal_id } $node1->each_Descendent;
          my ($l2,$r2) = map { $_->internal_id } $node2->each_Descendent;
          @mm = @{$m[$l1][$l2]};
          push @mm, @{$m[$r1][$r2]};
          if (scalar(@{$m[$l1][$r2]})+scalar(@{$m[$r1][$l2]})>scalar(@mm)) {
            @mm = @{$m[$l1][$r2]};
            push @mm, @{$m[$r1][$l2]};
          }
          if (scalar(@{$m[$n1][$l2]})>scalar(@mm)) {
            @mm = @{$m[$n1][$l2]};
          }
          if (scalar(@{$m[$n1][$r2]})>scalar(@mm)) {
            @mm = @{$m[$n1][$r2]};
          }
          if (scalar(@{$m[$l1][$n2]})>scalar(@mm)) {
            @mm = @{$m[$l1][$n2]};
          }
          if (scalar(@{$m[$r1][$n2]})>scalar(@mm)) {
            @mm = @{$m[$r1][$n2]};
          }
        }
      }
      $m[$n1][$n2] = \@mm;
    }
  }
  return \@m;
}
mast <- function (t1,t2) {
  po1 <- postorder(t1)
  po2 <- postorder(t2)
  m <- matrix(rep(list(),length(po1)),nrow=length(po1),ncol=length(po2),
    dimnames=list(po1,po2))
  for (i in po1) {
    for (j in po2) {
      mm <- list()
      if (length(t1$edge[which(t1$edge[,1]==i),1])==0) {
        if (length(t2$edge[which(t2$edge[,1]==j),1])==0) {
          if (t1$tip.label[i]==t2$tip.label[j])
            mm <- c(t1$tip.label[i])
        } else {
          jj <- which(t2$tip.label==t1$tip.label[i])
          if (is.ancestor(t2,j,jj))
            mm <- c(t1$tip.label[i])
        }
      } else {
        if (length(t2$edge[which(t2$edge[,1]==j),1])==0) {
          ii <- which(t1$tip.label==t2$tip.label[j])
          if (is.ancestor(t1,i,ii))
            mm <- c(t2$tip.label[j])
        } else {
          l1 <- children(t1,i)[1]
          r1 <- children(t1,i)[2]
          l2 <- children(t2,j)[1]
          r2 <- children(t2,j)[2]
          mm <- c(m[[l1,l2]],m[[r1,r2]])
          if ( length(m[[l1,r2]]) + length(m[[r1,l2]]) > length(mm) )
            mm <- c(m[[l1,r2]],m[[r1,l2]])
          if ( length(m[[i,l2]]) > length(mm) )
            mm <- m[[i,l2]]
          if ( length(m[[i,r2]]) > length(mm) )
            mm <- m[[i,r2]]
          if ( length(m[[l1,j]]) > length(mm) )
            mm <- m[[l1,j]]
          if ( length(m[[r1,j]]) > length(mm) )
            mm <- m[[r1,j]]
        }
      }
      m[[i,j]] <- mm
    }
  }
  unlist(m[[length(t1$tip.label)+1,length(t2$tip.label)+1]])
}
Section 7.2.2: Maximum Agreement of Unrooted Trees

Section 7.3: Comparing Trees

Section 7.3.1: The Triplets Distance between Rooted Trees
sub triplets_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @leaves = sort {$a->id cmp $b->id} $tree1->get_leaf_nodes;
  my @labels = map { $_->id } @leaves;
  my $dist = 0;
  for (my $i = 0; $i < @leaves; $i++) {
    for (my $j = $i+1; $j < @leaves; $j++) {
      for (my $k = $j+1; $k < @leaves; $k++) {
        my $t1 = triplet($tree1,$labels[$i],$labels[$j],$labels[$k]);
        my $t2 = triplet($tree2,$labels[$i],$labels[$j],$labels[$k]);
        $dist += 2 unless ($t1 eq $t2);
      }
    }
  }
  return $dist;
}
triplets.distance <- function (t1,t2) {
  L <- sort(t1$tip.label)
  d <- 0
  for (i in L[1:(length(L)-2)]) {
    for (j in L[(match(i,L)+1):(length(L)-1)]) {
      for (k in L[(match(j,L)+1):length(L)]) {
        str1 <- triplet(t1,i,j,k)
        str2 <- triplet(t2,i,j,k)
        if (str1 != str2) { d <- d + 2 }
      }
    }
  }
  d
}
Section 7.3.2: The Quartets Distance between Unrooted Trees
sub quartets_distance {
  my $tree1 = shift;
  my $tree2 = shift;
  my @leaves = sort {$a->id cmp $b->id} $tree1->get_leaf_nodes;
  my @labels = map { $_->id } @leaves;
  my $dist = 0;
  for (my $i = 0; $i < @leaves; $i++) {
    for (my $j = $i+1; $j < @leaves; $j++) {
      for (my $k = $j+1; $k < @leaves; $k++) {
        for (my $l = $k+1; $l < @leaves; $l++) {
          my $q1 = quartet($tree1,
            $labels[$i],$labels[$j],$labels[$k],$labels[$l]);
          my $q2 = quartet($tree2,
            $labels[$i],$labels[$j],$labels[$k],$labels[$l]);
          if (partition_distance($q1,$q2) != 0) {
            $dist += 2;
          }
        }
      }
    }
  }
  return $dist;
}
quartets.distance <- function (t1,t2) {
  L <- sort(t1$tip.label)
  d <- 0
  for (i in L[1:(length(L)-3)]) {
    for (j in L[(match(i,L)+1):(length(L)-2)]) {
      for (k in L[(match(j,L)+1):(length(L)-1)]) {
        for (l in L[(match(k,L)+1):length(L)]) {
          q1 <- quartet(t1,i,j,k,l)
          q2 <- quartet(t2,i,j,k,l)
          if (dist.topo(q1,q2) != 0) { d <- d + 2 }
        }
      }
    }
  }
  d
}

Chapter 8: Graphs

Section 8.1: Graphs in Mathematics

Section 8.1.1: Counting Labeled Graphs

Section 8.2: Graphs in Computer Science

Section 8.2.1: Traversing Directed Graphs

Section 8.3: Graphs in Computational Biology

Section 8.3.1: The eNewick Linear Representation
Section 8.3.2: Counting Phylogenetic Networks
Section 8.3.3: Generating Phylogenetic Networks
Section 8.3.4: Representing Graphs in Perl
Section 8.3.5: Representing Graphs in R

Chapter 9: Simple Pattern Matching in Graphs

Section 9.1: Finding Paths in Graphs


network.height <- function (net) {
  V(net)$height <- 0
  V(net)$visited <- FALSE
  Q <- as.vector(V(net)[which(degree(net,mode="out")==0)-1])
  while (length(Q)>0) {
    j <- Q[1]
    Q <- Q[-1]
    V(net)[j]$visited <- TRUE
    I <- as.vector(V(net)[nei(j,mode="in")])
    for (i in I) {
      V(net)[i]$height <- max(V(net)[i]$height,V(net)[j]$height+1)
      if (all(V(net)[nei(i,mode="out")]$visited)) Q <- c(Q,i)
    }
  }
  V(net)$height
}
Section 9.1.1: Distances in Graphs
use Bio::PhyloNetwork;

sub is_strict_ancestor {
  my $net = shift;
  my $u = shift;
  my $v = shift;
  my $dag = $net->graph->copy_graph;

  my @roots = $dag->source_vertices;
  my $root = shift @roots;

  $dag->delete_vertex($u);
  return !$dag->is_reachable($root,$v);
}
library(igraph)

is.strict.ancestor <- function (net,i,j) {
  if (length(get.all.shortest.paths(net,
      V(net)[i],V(net)[j],mode="out")) == 0) return(FALSE)
  r <- V(net)[which(degree(net,mode="in")==0)-1]
  if (i == r || i == j) return(TRUE)
  net <- delete.vertices(net,i)
  if (i < j) j <- j - 1 # account for deleted node
  length(get.all.shortest.paths(net,r,V(net)[j],mode="out")) == 0
}

is.ancestor <- function (net,i,j) {
  length(get.all.shortest.paths(net,V(net)[i],V(net)[j],mode="out")) != 0
}
sub CSA {
  my $net = shift;
  my $v = shift;
  my $w = shift;
  my $dag = $net->graph;

  my @csa = ();
  foreach my $u ( $net->nodes ) {
    if ( $dag->is_reachable($u,$v) and $dag->is_reachable($u,$w) ) {
      if (is_strict_ancestor($net,$u,$v)
          or is_strict_ancestor($net,$u,$w)) {
        push @csa, $u;
      }
    }
  }

  return \@csa;
}
CSA <- function (net,i,j) {
  strict <- lapply(V(net),function (u) is.ancestor(net,u,i)
      && is.ancestor(net,u,j) && (is.strict.ancestor(net,u,i)
      || is.strict.ancestor(net,u,j)))
  V(net)[unlist(strict)]
}
sub LCSA {
  my $net = shift;
  my $v = shift;
  my $w = shift;

  my %height = $net->heights;
  my @CSA = @{ CSA($net,$v,$w) };

  my $lcsa = shift @CSA;
  for my $node ( @CSA ) {
    if ( $height{$node} < $height{$lcsa} ) {
      $lcsa = $node;
    }
  }

  return $lcsa;
}
LCSA <- function (net,i,j) {
  csa <- match(CSA(net,i,j),V(net))
  V(net)[csa[which.min(network.height(net)[csa])]-1]
}
Section 9.1.2: The Path Multiplicity Distance between Graphs

path.multiplicity <- function (net) {
  leaves <- V(net)[which(degree(net,mode="out")==0)-1]
  mu <- matrix(0,nrow=vcount(net),ncol=length(leaves),
      dimnames=list(sort(V(net)$name),sort(leaves$name)))
  for (v in sort(V(net)[leaves]$name)) mu[v,v] <- 1
  V(net)$visited <- FALSE
  Q <- as.vector(V(net)[which(degree(net,mode="out")==0)-1])
  while (length(Q)>0) {
    j <- Q[1]
    Q <- Q[-1]
    V(net)[j]$visited <- TRUE
    I <- as.vector(V(net)[nei(j,mode="in")])
    for (i in I) {
      mu[V(net)[i]$name,] <- mu[V(net)[i]$name,] + mu[V(net)[j]$name,]
      if (all(V(net)[nei(i,mode="out")]$visited)) Q <- c(Q,i)
    }
  }
  mu
}

lex.cmp <- function(vec1,vec2) {
  index <- which.min(vec1 == vec2)
  as.numeric(sign(vec1[index] - vec2[index]))
}

path.multiplicity.distance <- function (net1,net2) {
  mu1 <- path.multiplicity(net1)
  mu1 <- mu1[do.call("order",lapply(1:ncol(mu1),function (j) mu1[,j])),]
  mu2 <- path.multiplicity(net2)
  mu2 <- mu2[do.call("order",lapply(1:ncol(mu2),function (j) mu2[,j])),]
  i1 <- 1
  i2 <- 1
  c <- 0
  while (i1 <= nrow(mu1) && i2 <= nrow(mu2)) {
    lex <- lex.cmp(mu1[i1,],mu2[i2,])
    if ( lex == -1 ) {
      i1 <- i1 + 1
    } else if ( lex == 1 ) {
      i2 <- i2 + 1
    } else {
      i1 <- i1 + 1
      i2 <- i2 + 1
      c <- c + 1
    }
  }
  nrow(mu1)+nrow(mu2)-2*c
}
Section 9.1.3: The Tripartition Distance between Graphs

tripartitions <- function (net) {
  leaves <- V(net)[which(degree(net,mode="out")==0)-1]
  theta <- matrix("",nrow=vcount(net),ncol=length(leaves),
      dimnames=list(sort(V(net)$name),sort(leaves$name)))
  for (i in V(net)) {
    for (j in leaves) {
      if ( is.ancestor(net,i,j) ) {
        if ( is.strict.ancestor(net,i,j) ) {
          theta[V(net)[i]$name,V(net)[j]$name] <- "A"
        } else {
          theta[V(net)[i]$name,V(net)[j]$name] <- "B"
        }
      } else {
        theta[V(net)[i]$name,V(net)[j]$name] <- "C"
      }
    }
  }
  theta
}
use Bio::PhyloNetwork;

sub tripartition_distance {
  my $net1 = shift;
  my $net2 = shift;

  $net1->compute_tripartitions() unless defined $net1->{tripartitions};
  $net2->compute_tripartitions() unless defined $net2->{tripartitions};

  my @tri1 = sort map {$net1->{tripartitions}->{$_}} $net1->nodes;
  my @tri2 = sort map {$net2->{tripartitions}->{$_}} $net2->nodes;

  my $i1 = 0;
  my $i2 = 0;
  my $c = 0;
  while ($i1 < scalar @tri1 && $i2 < scalar @tri2) {
    if ($tri1[$i1] lt $tri2[$i2]) {
      $i1++;
    } elsif ($tri1[$i1] gt $tri2[$i2]) {
      $i2++;
    } else {
      $i1++;
      $i2++;
      $c++;
    }
  }

  return scalar @tri1+scalar @tri2-2*$c;
}
library(igraph)

lex.cmp.char <- function (vec1,vec2) {
  for (j in 1:length(vec1)) {
    if (vec1[j] < vec2[j]) { return(-1) }
    if (vec1[j] > vec2[j]) { return(1) }
  }
  return(0)
}

tripartition.distance <- function (net1,net2) {
  theta1 <- tripartitions(net1)
  theta1 <- theta1[do.call("order",lapply(1:ncol(theta1),
      function (j) theta1[,j])),]
  theta2 <- tripartitions(net2)
  theta2 <- theta2[do.call("order",lapply(1:ncol(theta2),
      function (j) theta2[,j])),]
  i1 <- 1
  i2 <- 1
  c <- 0
  while (i1 <= nrow(theta1) && i2 <= nrow(theta2)) {
    lex <- lex.cmp.char(theta1[i1,],theta2[i2,])
    if ( lex == -1 ) {
      i1 <- i1 + 1
    } else if ( lex == 1 ) {
      i2 <- i2 + 1
    } else {
      i1 <- i1 + 1
      i2 <- i2 + 1
      c <- c + 1
    }
  }
  nrow(theta1)+nrow(theta2)-2*c
}
Section 9.1.4: The Nodal Distance between Graphs
sub nodal_distance {
  my $net1 = shift;
  my $net2 = shift;
  my @L = $net1->leaves;
  my $dist = 0;
  for (my $i = 0; $i < @L; $i++) {
    for (my $j = $i+1; $j < @L; $j++) {
      my $lcsa1 = LCSA($net1,$L[$i],$L[$j]);
      my $lcsa2 = LCSA($net2,$L[$i],$L[$j]);
      my @p1 = $net1->graph->SP_Dijkstra($lcsa1,$L[$i]);
      my @p2 = $net2->graph->SP_Dijkstra($lcsa2,$L[$i]);
      my $d1 = @p1 - 1;
      my $d2 = @p2 - 1;
      $dist += abs($d1 - $d2);
      @p1 = $net1->graph->SP_Dijkstra($lcsa1,$L[$j]);
      @p2 = $net2->graph->SP_Dijkstra($lcsa2,$L[$j]);
      $d1 = @p1 - 1;
      $d2 = @p2 - 1;
      $dist += abs($d1 - $d2);
    }
  }
  return $dist;
}
library(igraph)

nodal.distance <- function (net1,net2) {
  l1 <- V(net1)[which(degree(net1,mode="out")==0)-1]
  n1 <- matrix(0,nrow=length(l1),ncol=length(l1),
      dimnames=list(sort(l1$name),sort(l1$name)))
  l2 <- V(net2)[which(degree(net2,mode="out")==0)-1]
  n2 <- matrix(0,nrow=length(l2),ncol=length(l2),
      dimnames=list(sort(l2$name),sort(l2$name)))
  for (i in l1) {
    for (j in c(l1)[match(i,l1):length(l1)]) {
      lcsa <- LCSA(net1,i,j)
      paths <- get.shortest.paths(net1,lcsa,i,mode="out")
      n1[V(net1)[i]$name,V(net1)[j]$name] <- length(paths[[1]])-1
      paths <- get.shortest.paths(net1,lcsa,j,mode="out")
      n1[V(net1)[j]$name,V(net1)[i]$name] <- length(paths[[1]])-1
    }
  }
  for (i in l2) {
    for (j in c(l2)[match(i,l2):length(l2)]) {
      lcsa <- LCSA(net2,i,j)
      paths <- get.shortest.paths(net2,lcsa,i,mode="out")
      n2[V(net2)[i]$name,V(net2)[j]$name] <- length(paths[[1]])-1
      paths <- get.shortest.paths(net2,lcsa,j,mode="out")
      n2[V(net2)[j]$name,V(net2)[i]$name] <- length(paths[[1]])-1
    }
  }
  sum(abs(n1-n2))
}

Section 9.2: Finding Trees in Graphs

sub explode_rec {
  my ($self,$trees) = @_;
  my @h = $self->hybrid_nodes;
  if (scalar @h) {
    my $v = shift @h;
    for my $u ($self->{graph}->predecessors($v)) {
      $self->{graph}->delete_edge($u,$v);
      $self->explode_rec($trees);
      $self->{graph}->add_edge($u,$v);
    }
  } else {
    my $io = IO::String->new($self->eNewick);
    my $treeio = Bio::TreeIO->new(-format => 'newick', -fh => $io);
    my $tree = $treeio->next_tree;
    $tree->contract_linear_paths;
    push @{$trees}, $tree;
  }
}

sub explode {
  my ($self) = @_;
  my @trees;
  $self->explode_rec(\@trees);
  return @trees;
}
network.to.newick <- function (net) {
  r <- V(net)[which(degree(net,mode="in")==0)-1]
  newick <- obtain.newick(net,r,c())
  paste(c(newick,";"),collapse="")
}

obtain.newick <- function (net,v,newick) {
  if (length(V(net)[nei(v,"out")]) != 0) {
    if ( length(V(net)[nei(v,"in")]) != 0
        && match(v,V(net)[nei(V(net)[nei(v,"in")],"out")]) != 1 )
      newick <- c(newick,",")
    newick <- c(newick,"(")
  }
  for (w in V(net)[nei(v,"out")]) {
    newick <- obtain.newick(net,w,newick)
  }
  if (length(V(net)[nei(v,"out")]) != 0)
    newick <- c(newick,")")
  if ( length(V(net)[nei(v,"in")]) != 0
      && length(V(net)[nei(v,"out")]) == 0
      && match(v,V(net)[nei(V(net)[nei(v,"in")],"out")]) != 1 )
    newick <- c(newick,",")
  c(newick,V(net)[v]$name)
}

network.to.tree <- function (net) {
  V(net)$del <- FALSE
  E(net)$del <- FALSE
  r <- V(net)[which(degree(net,mode="in")==0)-1]
  net <- postorder.traversal(net,r)
  net <- delete.vertices(net,V(net)[del])
  net <- delete.edges(net,E(net)[del])
  network.to.newick(net)
}

postorder.traversal <- function (net,v) {
  for (w in V(net)[nei(v,"out")]) {
    net <- postorder.traversal(net,w)
  }
  VW <- E(net)[from == v & del==FALSE]
  if (length(c(VW)) == 1) {
    u <- V(net)[nei(v,"in")]
    w <- get.edge(net,VW)[2]
    E(net,path=c(V(net)[u],V(net)[v]))$del <- TRUE
    E(net,path=c(V(net)[v],V(net)[w]))$del <- TRUE
    V(net)[v]$del <- TRUE
    net <- add.edges(net,c(V(net)[u],V(net)[w]))
    E(net,path=c(V(net)[u],V(net)[w]))$del <- FALSE
  }
  net
}

explode <- function (net) {
  trees <- c()
  H <- V(net)[which(degree(net,mode="in")>1)-1]
  if (length(H) == 0) {
    trees <- c(trees,network.to.tree(net))
  } else {
    v <- c(H)[1]
    for (u in V(net)[nei(v,"in")]) {
      new <- delete.edges(net,E(net,P=c(u,v)))
      trees <- c(trees,explode(new))
    }
  }
  trees
}
Section 9.2.1: The Statistical Error between Graphs
sub error_ratio {
  my $net1 = shift;
  my $net2 = shift;
  my @trees1 = $net1->explode;
  my @trees2 = $net2->explode;
  my $FN = 0;
  my $FP = 0;
  for my $tree1 (@trees1) {
    my $found = 0; # false
    for my $tree2 (@trees2) {
      if (partition_distance($tree1,$tree2) == 0) {
        $found = 1; # true
      }
    }
    if (!$found) { $FN++; }
  }
  for my $tree2 (@trees2) {
    my $found = 0; # false
    for my $tree1 (@trees1) {
      if (partition_distance($tree1,$tree2) == 0) {
        $found = 1; # true
      }
    }
    if (!$found) { $FP++; }
  }
  return (($FN/@trees1)+($FP/@trees2))/2;
}
error.rate <- function (net1,net2) {
  trees1 <- explode(net1)
  trees2 <- explode(net2)
  FN <- 0
  FP <- 0
  for (t1 in trees1) {
    tree1 <- read.tree(text=t1)
    found <- FALSE
    for (t2 in trees2) {
      tree2 <- read.tree(text=t2)
      if (dist.topo(tree1,tree2) == 0) {
        found <- TRUE
        break
      }
    }
    if (!found) FN <- FN+1
  }
  for (t2 in trees2) {
    tree2 <- read.tree(text=t2)
    found <- FALSE
    for (t1 in trees1) {
      tree1 <- read.tree(text=t1)
      if (dist.topo(tree1,tree2) == 0) {
        found <- TRUE
        break
      }
    }
    if (!found) FP <- FP+1
  }
  (FN/length(trees1)+FP/length(trees2))/2
}

Chapter 10: General Pattern Matching in Graphs

Section 10.1: Finding Subgraphs

Section 10.1.1: Finding Subgraphs Induced by Triplets
sub triplet {
  my ($net,@triple) = @_;
  my ($i,$j,$k) = @triple;

  my @L = $net->leaves;
  my $ij = LCSA($net,$L[$i],$L[$j]);
  my $ik = LCSA($net,$L[$i],$L[$k]);
  my $jk = LCSA($net,$L[$j],$L[$k]);

  my $t;
  if ($net->graph->SP_Dijkstra($ij,$ik)) {
    if ($net->graph->SP_Dijkstra($ik,$ij)) {
      $t = "(($L[$j],$L[$k]),$L[$i]);";
    } else {
      if ($net->graph->SP_Dijkstra($jk,$ij)) {
        $t = "(($L[$i],$L[$k]),$L[$j]);";
      } else {
        $t = "(($L[$i],($L[$k])#H1),(#H1,$L[$j]));";
      }
    }
  } else {
    if ($net->graph->SP_Dijkstra($ik,$ij)) {
      if ($net->graph->SP_Dijkstra($jk,$ij)) {
        $t = "(($L[$i],$L[$j]),$L[$k]);";
      } else {
        $t = "(($L[$i],($L[$j])#H1),(#H1,$L[$k]));";
      }
    } else {
      $t = "(($L[$j],($L[$i])#H1),(#H1,$L[$k]));";
    }
  }
  return $t;
}
triplet <- function (net,i,j,k) {
    ij <- LCSA(net,i,j)
    ik <- LCSA(net,i,k)
    jk <- LCSA(net,j,k)

    ij.ancestor.of.ik <- length(get.all.shortest.paths(net,
        ij,ik,mode="out")) != 0
    ik.ancestor.of.ij <- length(get.all.shortest.paths(net,
        ik,ij,mode="out")) != 0
    ij.ancestor.of.jk <- length(get.all.shortest.paths(net,
        ij,jk,mode="out")) != 0
    jk.ancestor.of.ij <- length(get.all.shortest.paths(net,
        jk,ij,mode="out")) != 0

    if (ij.ancestor.of.ik)
      if (ik.ancestor.of.ij)
        paste("((",V(net)[j]$name,",",V(net)[k]$name,"),",
            V(net)[i]$name,");",sep="")
      else
        if (jk.ancestor.of.ij)
          paste("((",V(net)[i]$name,",",V(net)[k]$name,"),",
              V(net)[j]$name,");",sep="")
        else
          paste("((",V(net)[i]$name,",(",V(net)[k]$name,"))#H,(#H,",
              V(net)[j]$name,"));",sep="")
    else
      if (ik.ancestor.of.ij)
        if (jk.ancestor.of.ij)
          paste("((",V(net)[i]$name,",",V(net)[j]$name,"),",
              V(net)[k]$name,");",sep="")
        else
          paste("((",V(net)[i]$name,",(",V(net)[j]$name,"))#H,(#H,",
              V(net)[k]$name,"));",sep="")
      else
        paste("((",V(net)[j]$name,",(",V(net)[i]$name,"))#H,(#H,",
            V(net)[k]$name,"));",sep="")
}
sub triplet {
  my ($net,@triple) = @_;
  my ($i,$j,$k) = @triple;

  my @L = $net->leaves;
  my $ij = LCSA($net,$L[$i],$L[$j]);
  my $ik = LCSA($net,$L[$i],$L[$k]);
  my $jk = LCSA($net,$L[$j],$L[$k]);

  my @R;
  $R[0] = 1;
  $R[1] = $net->graph->SP_Dijkstra($ij,$ik) ? 1 : 0;
  $R[2] = $net->graph->SP_Dijkstra($ij,$jk) ? 1 : 0;
  $R[3] = $net->graph->SP_Dijkstra($ik,$ij) ? 1 : 0;
  $R[4] = 1;
  $R[5] = $net->graph->SP_Dijkstra($ik,$jk) ? 1 : 0;
  $R[6] = $net->graph->SP_Dijkstra($jk,$ij) ? 1 : 0;
  $R[7] = $net->graph->SP_Dijkstra($jk,$ik) ? 1 : 0;
  $R[8] = 1;

  return join "", @R;
}
triplet <- function (net,i,j,k) {
  ij <- LCSA(net,i,j)
  ik <- LCSA(net,i,k)
  jk <- LCSA(net,j,k)

  L <- c(V(net)[i]$name,V(net)[j]$name,V(net)[k]$name)
  R <- matrix(rep(NA,9),nrow=3,dimnames=list(L,L))
  R[1,1] <- TRUE
  R[1,2] <- length(get.all.shortest.paths(net,ij,ik,mode="out")) != 0
  R[1,3] <- length(get.all.shortest.paths(net,ij,jk,mode="out")) != 0
  R[2,1] <- length(get.all.shortest.paths(net,ik,ij,mode="out")) != 0
  R[2,2] <- TRUE
  R[2,3] <- length(get.all.shortest.paths(net,ik,jk,mode="out")) != 0
  R[3,1] <- length(get.all.shortest.paths(net,jk,ij,mode="out")) != 0
  R[3,2] <- length(get.all.shortest.paths(net,jk,ik,mode="out")) != 0
  R[3,3] <- TRUE

  R
}

Section 10.2: Finding Common Subgraphs

Section 10.2.1: Maximum Agreement of Rooted Networks
sub bottom_up_subgraph {
  my $graph = shift;
  my $node = shift;
  my $sub = Graph->new;
  my @Q = ($node);
  while (@Q) {
    $node = shift @Q;
    $sub->add_vertex($node);
    push @Q, $graph->successors($node);
  }
  for my $edge ($graph->edges) {
    if ($sub->has_vertex(@$edge[0]) &&
        $sub->has_vertex(@$edge[1])) {
      $sub->add_edge(@$edge[0],@$edge[1]);
    }
  }
  while (my @V = grep {
      $sub->in_degree($_)==1 &&
      $sub->out_degree($_)==1 }
      $sub->interior_vertices) {
    my $v = shift @V;
    my @U = $sub->predecessors($v);
    my $u = shift @U;
    my @W = $sub->successors($v);
    my $w = shift @W;
    $sub->add_edge($u,$w);
    $sub->delete_vertex($v);
  }
  return $sub;
}
bottom.up.subgraph <- function (net,i) {
  Q <- c(i)
  S <- c()
  while (length(Q)>0) {
    i <- Q[1]
    Q <- Q[-1]
    S <- c(S,i)
    Q <- c(Q,as.vector(V(net)[nei(i,mode="out")]))
  }
  g <- subgraph(net,unique(S))
  while (length(V(g)[which(degree(g,mode="in")==1 &
      degree(g,mode="out")==1)-1])>0) {
    D <- V(g)[which(degree(g,mode="in")==1 &
      degree(g,mode="out")==1)-1]
    v <- head(c(D),n=1) # elementary node
    u <- V(g)[nei(v,"in")]
    w <- V(g)[nei(v,"out")]
    g <- add.edges(g,c(V(g)[u],V(g)[w]))
    g <- delete.vertices(g,V(g)[v])
  }
  g
}
sub graph_isomorphic {
  my $g1 = shift;
  my $g2 = shift;
  my $found = 0;
  if ($g1->could_be_isomorphic($g2)) {
    my $C1 = cluster_map($g1);
    my $C2 = cluster_map($g2);
    my %X;
    for my $node1 ($g1->vertices) {
      for my $node2 ($g2->vertices) {
        my $lc = List::Compare->new(@{$C1}{$node1},
          @{$C2}{$node2});
        if ($lc->is_LequivalentR) {
          push @{$X{$node1}}, $node2;
        }
      }
    }
    my $node1 = undef;
    my %M;
    my @V1 = sort $g1->vertices;
    backtrack($g1,$g2,\%X,$node1,\%M,\@V1,\$found);
  }
  return $found;
}

sub backtrack {
  my ($g1,$g2,$X,$node1,$M,$V1,$found) = @_;
  my %X = %{$X};
  my %M = %{$M};
  my @V1 = @{$V1};
  unless ($$found) {
    for my $prev1 (keys %M) {
      if ($g1->has_edge($prev1,$node1) &&
          !$g2->has_edge($M{$prev1},$M{$node1}) ||
        $g1->has_edge($node1,$prev1) &&
          !$g2->has_edge($M{$node1},$M{$prev1}) ||
        !$g1->has_edge($prev1,$node1) &&
          $g2->has_edge($M{$prev1},$M{$node1}) ||
        !$g1->has_edge($node1,$prev1) &&
          $g2->has_edge($M{$node1},$M{$prev1})) {
        return;
      }
    }
    if (scalar @V1) {
      $node1 = shift @V1;
      for my $node2 (@{$X{$node1}}) {
        $M{$node1} = $node2;
        backtrack($g1,$g2,$X,$node1,\%M,\@V1,$found);
        $M{$node1} = undef;
      }
    } else {
      $$found = 1;
    }
  }
}

sub cluster_map {
  my $graph = shift;
  my %C;
  for my $node ($graph->vertices) {
    for my $leaf ($graph->successorless_vertices) {
      if ($graph->is_reachable($node,$leaf)) {
        push @{$C{$node}}, $leaf;
      }
    }
  }
  return \%C;
}

sub max_agreement_subgraph {
  my $net1 = shift;
  my $net2 = shift;
  my $graph1 = $net1->graph;
  my $graph2 = $net2->graph;
  my $common = Graph->new;
  my $size = 0;
  for my $node1 ($graph1->interior_vertices) {
    my $sub1 = subgraph($graph1,$node1);
    for my $node2 ($graph2->interior_vertices) {
      my $sub2 = subgraph($graph2,$node2);
      if (graph_isomorphic($sub1,$sub2)) {
        my $leaves1 = $sub1->successorless_vertices;
        if ($leaves1 > $size) {
          $common = $sub1;
          $size = $leaves1;
        }
      }
    }
  }
  return $common;
}
max.agreement.subgraph <- function (net1,net2) {
  max <- graph.empty()
  max.size <- 0
  for (i in V(net1)) {
    g1 <- bottom.up.subgraph(net1,i)
    for (j in V(net2)) {
      g2 <- bottom.up.subgraph(net2,j)
      if (graph.isomorphic(g1,g2)) {
        L <- V(g1)[which(degree(g1,mode="out")==0)-1]
        if (length(L)>max.size) {
          max <- g1
          max.size <- length(L)
        }
      }
    }
  }
  max
}

Section 10.3: Comparing Graphs

Section 10.3.1: The Triplets Distance between Graphs
sub triplets_distance {
  my $net1 = shift;
  my $net2 = shift;
  my $comp = Array::Compare->new;
  my @L = $net1->leaves;
  my $dist = 0;
  for (my $i = 0; $i < @L; $i++) {
    for (my $j = $i+1; $j < @L; $j++) {
      for (my $k = $j+1; $k < @L; $k++) {
        my $t1 = triplet($net1,$i,$j,$k);
        my $t2 = triplet($net2,$i,$j,$k);
	$dist += 2 unless ($t1 eq $t2);
      }
    }
  }
  return $dist;
}
triplets.distance <- function (net1,net2) {
  L <- sort(V(net1)[which(degree(net1,mode="out")==0)-1]$name)
  d <- 0
  for (i in L[1:(length(L)-2)]) {
    i1 <- V(net1)[V(net1)$name == i]
    i2 <- V(net2)[V(net2)$name == i]
    for (j in L[(match(i,L)+1):(length(L)-1)]) {
      j1 <- V(net1)[V(net1)$name == j]
      j2 <- V(net2)[V(net2)$name == j]
      for (k in L[(match(j,L)+1):length(L)]) {
        k1 <- V(net1)[V(net1)$name == k]
        k2 <- V(net2)[V(net2)$name == k]
        t1 <- triplet(net1,i1,j1,k1)
        t2 <- triplet(net2,i2,j2,k2)
        if (!isTRUE(all.equal(t1,t2))) { d <- d + 2 }
      }
    }
  }
  d
}

Appendix A: Elements of Perl

Section A.1: Perl Scripts

sub rna_to_protein {
  my $rna = shift;
  my $protein;
  my $i = 0;
  while ($i < length($rna) - 2 &&
      substr($rna,$i,3) ne "AUG") { # start codon
    $i++;
  }
  $i += 3; # skip the start codon
  while ($i < length($rna) - 2 &&
      substr($rna,$i,3) ne "UAA" &&
      substr($rna,$i,3) ne "UAG" &&
      substr($rna,$i,3) ne "UGA") {
    my $codon = substr($rna,$i,3);
    $protein .= codon_to_amino_acid($codon);
    $i += 3;
  }
  return $protein;
}

sub codon_to_amino_acid {
  my $codon = shift;
  if    ($codon eq "UUU") { return "F" } # Phe
  elsif ($codon eq "UUC") { return "F" } # Phe
  elsif ($codon eq "UUA") { return "L" } # Leu
  elsif ($codon eq "UUG") { return "L" } # Leu
  elsif ($codon eq "UCU") { return "S" } # Ser
  elsif ($codon eq "UCC") { return "S" } # Ser
  elsif ($codon eq "UCA") { return "S" } # Ser
  elsif ($codon eq "UCG") { return "S" } # Ser
  elsif ($codon eq "UAU") { return "Y" } # Tyr
  elsif ($codon eq "UAC") { return "Y" } # Tyr
  elsif ($codon eq "UAA") { return "-" } # stop
  elsif ($codon eq "UAG") { return "-" } # stop
  elsif ($codon eq "UGU") { return "C" } # Cys
  elsif ($codon eq "UGC") { return "C" } # Cys
  elsif ($codon eq "UGA") { return "-" } # stop
  elsif ($codon eq "UGG") { return "W" } # Trp
  elsif ($codon eq "CUU") { return "L" } # Leu
  elsif ($codon eq "CUC") { return "L" } # Leu
  elsif ($codon eq "CUA") { return "L" } # Leu
  elsif ($codon eq "CUG") { return "L" } # Leu
  elsif ($codon eq "CCU") { return "P" } # Pro
  elsif ($codon eq "CCC") { return "P" } # Pro
  elsif ($codon eq "CCA") { return "P" } # Pro
  elsif ($codon eq "CCG") { return "P" } # Pro
  elsif ($codon eq "CAU") { return "H" } # His
  elsif ($codon eq "CAC") { return "H" } # His
  elsif ($codon eq "CAA") { return "Q" } # Gln
  elsif ($codon eq "CAG") { return "Q" } # Gln
  elsif ($codon eq "CGU") { return "R" } # Arg
  elsif ($codon eq "CGC") { return "R" } # Arg
  elsif ($codon eq "CGA") { return "R" } # Arg
  elsif ($codon eq "CGG") { return "R" } # Arg
  elsif ($codon eq "AUU") { return "I" } # Ile
  elsif ($codon eq "AUC") { return "I" } # Ile
  elsif ($codon eq "AUA") { return "I" } # Ile
  elsif ($codon eq "AUG") { return "M" } # Met
  elsif ($codon eq "ACU") { return "T" } # Thr
  elsif ($codon eq "ACC") { return "T" } # Thr
  elsif ($codon eq "ACA") { return "T" } # Thr
  elsif ($codon eq "ACG") { return "T" } # Thr
  elsif ($codon eq "AAU") { return "N" } # Asn
  elsif ($codon eq "AAC") { return "N" } # Asn
  elsif ($codon eq "AAA") { return "K" } # Lys
  elsif ($codon eq "AAG") { return "K" } # Lys
  elsif ($codon eq "AGU") { return "S" } # Ser
  elsif ($codon eq "AGC") { return "S" } # Ser
  elsif ($codon eq "AGA") { return "R" } # Arg
  elsif ($codon eq "AGG") { return "R" } # Arg
  elsif ($codon eq "GUU") { return "V" } # Val
  elsif ($codon eq "GUC") { return "V" } # Val
  elsif ($codon eq "GUA") { return "V" } # Val
  elsif ($codon eq "GUG") { return "V" } # Val
  elsif ($codon eq "GCU") { return "A" } # Ala
  elsif ($codon eq "GCC") { return "A" } # Ala
  elsif ($codon eq "GCA") { return "A" } # Ala
  elsif ($codon eq "GCG") { return "A" } # Ala
  elsif ($codon eq "GAU") { return "D" } # Asp
  elsif ($codon eq "GAC") { return "D" } # Asp
  elsif ($codon eq "GAA") { return "E" } # Glu
  elsif ($codon eq "GAG") { return "E" } # Glu
  elsif ($codon eq "GGU") { return "G" } # Gly
  elsif ($codon eq "GGC") { return "G" } # Gly
  elsif ($codon eq "GGA") { return "G" } # Gly
  elsif ($codon eq "GGG") { return "G" } # Gly
  else { return "*" }
}

my %codon2aa = (
  "UUU" => "F", # Phe
  "UUC" => "F", # Phe
  "UUA" => "L", # Leu
  "UUG" => "L", # Leu
  "UCU" => "S", # Ser
  "UCC" => "S", # Ser
  "UCA" => "S", # Ser
  "UCG" => "S", # Ser
  "UAU" => "Y", # Tyr
  "UAC" => "Y", # Tyr
  "UAA" => "-", # stop
  "UAG" => "-", # stop
  "UGU" => "C", # Cys
  "UGC" => "C", # Cys
  "UGA" => "-", # stop
  "UGG" => "W", # Trp
  "CUU" => "L", # Leu
  "CUC" => "L", # Leu
  "CUA" => "L", # Leu
  "CUG" => "L", # Leu
  "CCU" => "P", # Pro
  "CCC" => "P", # Pro
  "CCA" => "P", # Pro
  "CCG" => "P", # Pro
  "CAU" => "H", # His
  "CAC" => "H", # His
  "CAA" => "Q", # Gln
  "CAG" => "Q", # Gln
  "CGU" => "R", # Arg
  "CGC" => "R", # Arg
  "CGA" => "R", # Arg
  "CGG" => "R", # Arg
  "AUU" => "I", # Ile
  "AUC" => "I", # Ile
  "AUA" => "I", # Ile
  "AUG" => "M", # Met
  "ACU" => "T", # Thr
  "ACC" => "T", # Thr
  "ACA" => "T", # Thr
  "ACG" => "T", # Thr
  "AAU" => "N", # Asn
  "AAC" => "N", # Asn
  "AAA" => "K", # Lys
  "AAG" => "K", # Lys
  "AGU" => "S", # Ser
  "AGC" => "S", # Ser
  "AGA" => "R", # Arg
  "AGG" => "R", # Arg
  "GUU" => "V", # Val
  "GUC" => "V", # Val
  "GUA" => "V", # Val
  "GUG" => "V", # Val
  "GCU" => "A", # Ala
  "GCC" => "A", # Ala
  "GCA" => "A", # Ala
  "GCG" => "A", # Ala
  "GAU" => "D", # Asp
  "GAC" => "D", # Asp
  "GAA" => "E", # Glu
  "GAG" => "E", # Glu
  "GGU" => "G", # Gly
  "GGC" => "G", # Gly
  "GGA" => "G", # Gly
  "GGG" => "G"  # Gly
);

sub rna_to_protein {
  my $rna = shift;
  my $protein;
  my $aa;
  my $i = 0;
  while ($i < length($rna) - 2 &&
      substr($rna,$i,3) ne "AUG") { # start codon
    $i++;
  }
  $i += 3; # skip the start codon
  while ($i < length($rna) - 2 &&
      substr($rna,$i,3) ne "UAA" &&
      substr($rna,$i,3) ne "UAG" &&
      substr($rna,$i,3) ne "UGA") {
    my $codon = substr($rna,$i,3);
    if (defined $codon2aa{$codon}) {
      $aa = $codon2aa{$codon};
    } else {
      $aa = "*";
    }
    $protein .= $aa;
    $i += 3;
  }
  return $protein;
}

Section A.2: Overview of Perl

package DNA;

sub reverse_complement {
  my $seq = shift;
  $seq = reverse $seq;
  $seq =~ tr/ACGT/TGCA/;
  return $seq;
}

package RNA;

sub reverse_complement {
  my $seq = shift;
  $seq = reverse $seq;
  $seq =~ tr/ACGU/UGCA/;
  return $seq;
}

1;

Section A.3: Perl Quick Reference Card

Appendix B: Elements of R

Section B.1: R Scripts


rna.to.protein <- function (rna) {
  protein <- ""
  i <- 1
  while (i < nchar(rna)-1 &&
      substr(rna,i,i+2) != "AUG") { # start codon
    i <- i+1
  }
  i <- i+3 # skip the start codon
  while (i < nchar(rna)-1 &&
      substr(rna,i,i+2) != "UAA" &&
      substr(rna,i,i+2) != "UAG" &&
      substr(rna,i,i+2) != "UGA") {
    codon <- substr(rna,i,i+2)
    protein <- paste(protein,
      codon.to.amino.acid(codon),sep="")
    i <- i+3
  }
  return(protein)
}

codon.to.amino.acid <- function (codon) {
  if      (codon == "UUU") return("F") # Phe
  else if (codon == "UUC") return("F") # Phe
  else if (codon == "UUA") return("L") # Leu
  else if (codon == "UUG") return("L") # Leu
  else if (codon == "UCU") return("S") # Ser
  else if (codon == "UCC") return("S") # Ser
  else if (codon == "UCA") return("S") # Ser
  else if (codon == "UCG") return("S") # Ser
  else if (codon == "UAU") return("Y") # Tyr
  else if (codon == "UAC") return("Y") # Tyr
  else if (codon == "UAA") return("-") # stop
  else if (codon == "UAG") return("-") # stop
  else if (codon == "UGU") return("C") # Cys
  else if (codon == "UGC") return("C") # Cys
  else if (codon == "UGA") return("-") # stop
  else if (codon == "UGG") return("W") # Trp
  else if (codon == "CUU") return("L") # Leu
  else if (codon == "CUC") return("L") # Leu
  else if (codon == "CUA") return("L") # Leu
  else if (codon == "CUG") return("L") # Leu
  else if (codon == "CCU") return("P") # Pro
  else if (codon == "CCC") return("P") # Pro
  else if (codon == "CCA") return("P") # Pro
  else if (codon == "CCG") return("P") # Pro
  else if (codon == "CAU") return("H") # His
  else if (codon == "CAC") return("H") # His
  else if (codon == "CAA") return("Q") # Gln
  else if (codon == "CAG") return("Q") # Gln
  else if (codon == "CGU") return("R") # Arg
  else if (codon == "CGC") return("R") # Arg
  else if (codon == "CGA") return("R") # Arg
  else if (codon == "CGG") return("R") # Arg
  else if (codon == "AUU") return("I") # Ile
  else if (codon == "AUC") return("I") # Ile
  else if (codon == "AUA") return("I") # Ile
  else if (codon == "AUG") return("M") # Met
  else if (codon == "ACU") return("T") # Thr
  else if (codon == "ACC") return("T") # Thr
  else if (codon == "ACA") return("T") # Thr
  else if (codon == "ACG") return("T") # Thr
  else if (codon == "AAU") return("N") # Asn
  else if (codon == "AAC") return("N") # Asn
  else if (codon == "AAA") return("K") # Lys
  else if (codon == "AAG") return("K") # Lys
  else if (codon == "AGU") return("S") # Ser
  else if (codon == "AGC") return("S") # Ser
  else if (codon == "AGA") return("R") # Arg
  else if (codon == "AGG") return("R") # Arg
  else if (codon == "GUU") return("V") # Val
  else if (codon == "GUC") return("V") # Val
  else if (codon == "GUA") return("V") # Val
  else if (codon == "GUG") return("V") # Val
  else if (codon == "GCU") return("A") # Ala
  else if (codon == "GCC") return("A") # Ala
  else if (codon == "GCA") return("A") # Ala
  else if (codon == "GCG") return("A") # Ala
  else if (codon == "GAU") return("D") # Asp
  else if (codon == "GAC") return("D") # Asp
  else if (codon == "GAA") return("E") # Glu
  else if (codon == "GAG") return("E") # Glu
  else if (codon == "GGU") return("G") # Gly
  else if (codon == "GGC") return("G") # Gly
  else if (codon == "GGA") return("G") # Gly
  else if (codon == "GGG") return("G") # Gly
  else return("*")
}

codon2aa <- c(
  "UUU" = "F", # Phe
  "UUC" = "F", # Phe
  "UUA" = "L", # Leu
  "UUG" = "L", # Leu
  "UCU" = "S", # Ser
  "UCC" = "S", # Ser
  "UCA" = "S", # Ser
  "UCG" = "S", # Ser
  "UAU" = "Y", # Tyr
  "UAC" = "Y", # Tyr
  "UAA" = "-", # stop
  "UAG" = "-", # stop
  "UGU" = "C", # Cys
  "UGC" = "C", # Cys
  "UGA" = "-", # stop
  "UGG" = "W", # Trp
  "CUU" = "L", # Leu
  "CUC" = "L", # Leu
  "CUA" = "L", # Leu
  "CUG" = "L", # Leu
  "CCU" = "P", # Pro
  "CCC" = "P", # Pro
  "CCA" = "P", # Pro
  "CCG" = "P", # Pro
  "CAU" = "H", # His
  "CAC" = "H", # His
  "CAA" = "Q", # Gln
  "CAG" = "Q", # Gln
  "CGU" = "R", # Arg
  "CGC" = "R", # Arg
  "CGA" = "R", # Arg
  "CGG" = "R", # Arg
  "AUU" = "I", # Ile
  "AUC" = "I", # Ile
  "AUA" = "I", # Ile
  "AUG" = "M", # Met
  "ACU" = "T", # Thr
  "ACC" = "T", # Thr
  "ACA" = "T", # Thr
  "ACG" = "T", # Thr
  "AAU" = "N", # Asn
  "AAC" = "N", # Asn
  "AAA" = "K", # Lys
  "AAG" = "K", # Lys
  "AGU" = "S", # Ser
  "AGC" = "S", # Ser
  "AGA" = "R", # Arg
  "AGG" = "R", # Arg
  "GUU" = "V", # Val
  "GUC" = "V", # Val
  "GUA" = "V", # Val
  "GUG" = "V", # Val
  "GCU" = "A", # Ala
  "GCC" = "A", # Ala
  "GCA" = "A", # Ala
  "GCG" = "A", # Ala
  "GAU" = "D", # Asp
  "GAC" = "D", # Asp
  "GAA" = "E", # Glu
  "GAG" = "E", # Glu
  "GGU" = "G", # Gly
  "GGC" = "G", # Gly
  "GGA" = "G", # Gly
  "GGG" = "G"  # Gly
)

rna.to.protein <- function (rna) {
  protein <- ""
  i <- 1
  while (i < nchar(rna)-1 &&
      substr(rna,i,i+2) != "AUG") { # start codon
    i <- i+1
  }
  i <- i+3 # skip the start codon
  while (i < nchar(rna)-1 &&
      substr(rna,i,i+2) != "UAA" &&
      substr(rna,i,i+2) != "UAG" &&
      substr(rna,i,i+2) != "UGA") {
    codon <- substr(rna,i,i+2)
    if (is.na(codon2aa[codon])) {
      aa <- "*"
    } else {
      aa <- codon2aa[codon]
    }
    protein <- paste(protein,aa,sep="")
    i <- i+3
  }
  return(protein)
}

Section B.2: Overview of R


reverse.complement <- function (seq)
  UseMethod("reverse.complement")

reverse.complement.dna <- function (seq) {
  rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="")
  chartr("ACGT","TGCA",rev)
}

reverse.complement.rna <- function (seq) {
  rev <- paste(rev(unlist(strsplit(seq,split=""))),sep="",collapse="")
  chartr("ACGU","UGCA",rev)
}

Section B.3: R Quick Reference Card


Gabriel Valiente valiente@ (lsi.upc.edu)