1
/*
2
* This file is part of the diffson project.
3
*
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
7
*
8
* http://www.apache.org/licenses/LICENSE-2.0
9
*
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
15
*/
16
package diffson.lcs
17

18
import cats.Eq
19
import cats.implicits._
20

21
import scala.annotation.tailrec
22
import scala.collection.SortedMap
23
import scala.collection.immutable.TreeMap
24
import scala.collection.compat._
25

26
/** Implementation of the patience algorithm [1] to compute the longest common subsequence
27
 *
28
 *  [1] http://alfedenzo.livejournal.com/170301.html
29
 *
30
 *  @param withFallback whether to fallback to classic LCS when patience could not find the LCS
31
 *  @author Lucas Satabin
32
 */
33
class Patience[T: Eq](withFallback: Boolean = true) extends Lcs[T] {
34

35
  // algorithm we fall back to when patience algorithm is unable to find the LCS
36
  private val classicLcs =
37 1
    if (withFallback) Some(new DynamicProgLcs[T]) else None
38

39
  /** An occurrence of a value associated to its index */
40
  type Occurrence = (T, Int)
41

42
  /** Returns occurrences that appear only once in the list, associated with their index */
43
  private def uniques(l: List[T]): Map[T, Int] = {
44
    @tailrec
45
    def loop(l: List[Occurrence], acc: Map[T, Int]): Map[T, Int] = l match {
46
      case (value, idx) :: tl =>
47 1
        if (acc.contains(value))
48
          // not unique, remove it from the accumulator and go further
49 1
          loop(tl, acc - value)
50
        else
51 1
          loop(tl, acc + (value -> idx))
52
      case Nil =>
53
        acc
54
    }
55 1
    loop(l.zipWithIndex, Map.empty)
56
  }
57

58
  /** Takes all occurences from the first sequence and order them as in the second sequence if it is present */
59
  private def common(l1: Map[T, Int], l2: Map[T, Int]): List[(Occurrence, Int)] = {
60
    @tailrec
61
    def loop(l: List[Occurrence], acc: List[(Occurrence, Int)]): List[(Occurrence, Int)] = l match {
62
      case occ :: tl =>
63
        // find the element in the second sequence if present
64 1
        l2.get(occ._1) match {
65 1
          case Some(idx2) => loop(tl, (occ -> idx2) :: acc)
66 1
          case None       => loop(tl, acc)
67
        }
68
      case Nil =>
69
        // sort by order of appearance in the second sequence
70 1
        acc sortWith (_._2 < _._2)
71
    }
72 1
    loop(l1.toList, Nil)
73
  }
74

75
  /** Returns the list of elements that appear only once in both l1 and l2 ordered as they appear in l2 with their index in l1 */
76
  private def uniqueCommons(seq1: List[T], seq2: List[T]): List[(Occurrence, Int)] = {
77
    // the values that occur only once in the first sequence
78 1
    val uniques1 = uniques(seq1)
79
    // the values that occur only once in the second sequence
80 1
    val uniques2 = uniques(seq2)
81
    // now order the unique occurrences as they appear in the second list
82 1
    common(uniques1, uniques2)
83
  }
84

85
  /** Returns the longest sequence */
86
  private def longest(l: List[(Occurrence, Int)]): List[(Int, Int)] = {
87 1
    if (l.isEmpty) {
88 1
      Nil
89 1
    } else {
90
      type Stack = List[Stacked]
91

92
      def push(idx1: Int, idx2: Int, stacks: TreeMap[Int, Stack]): TreeMap[Int, Stack] = {
93 1
        stacks.iteratorFrom(idx1).take(1).toList.headOption match {
94
          case None =>
95
            // corresponding stack not found, create a new one
96 1
            val chainCont = stacks.lastOption.flatMap(_._2.headOption)
97 1
            stacks.updated(idx1, Stacked(idx1, idx2, chainCont) :: Nil)
98
          case Some((idx, oldStack)) =>
99
            // we found the right stack, replace it by new version
100
            val chainCont = {
101
              // we have to find a previous stack
102
              // don't know how efficient `until` is...
103 1
              stacks.rangeUntil(idx).lastOption.flatMap(_._2.headOption)
104
            }
105 1
            (stacks - idx).updated(idx1, Stacked(idx1, idx2, chainCont) :: oldStack)
106
        }
107
      }
108

109
      def sort(l: List[(Occurrence, Int)]): TreeMap[Int, Stack] = {
110
        // foreach item push it onto earliest stack for which: stack.idx1 > item.idx1
111
        // or create a new stack for it if none can be found
112

113
        // stacks are kept in a treeMap (minValue -> stack)
114
        // it makes it efficient to find the correct stack to update
115

116 1
        l.foldLeft(TreeMap.empty[Int, Stack]) {
117
          case (acc, ((_, idx1), idx2)) =>
118 1
            push(idx1, idx2, acc)
119
        }
120
      }
121 1
      val sorted = sort(l)
122
      // this call is safe as we know that the list of occurrence is not empty here and that there are no empty stacks
123 1
      val greatest = sorted.last._2.head
124
      // make the lcs in increasing order
125 1
      greatest.chain
126
    }
127
  }
128

129
  /** Checks if two sequences have at least one common element */
130
  private def haveCommonElements(s1: List[T], s2: List[T]): Boolean = {
131 1
    val s2Set = s2.toSet
132 1
    s1.exists(s2Set)
133
  }
134

135
  /** Computes the longest common subsequence between both sequences.
136
   *  It is encoded as the list of common indices in the first and the second sequence.
137
   */
138
  def lcs(s1: List[T], s2: List[T], low1: Int, high1: Int, low2: Int, high2: Int): List[(Int, Int)] = {
139 1
    val seq1 = s1.slice(low1, high1)
140 1
    val seq2 = s2.slice(low2, high2)
141 1
    if (seq1.isEmpty || seq2.isEmpty) {
142
      // shortcut if at least on sequence is empty, the lcs, is empty as well
143 1
      Nil
144 1
    } else if (seq1 === seq2) {
145
      // both sequences are equal, the lcs is either of them
146 1
      seq1.indices.map(i => (i + low1, i + low2)).toList
147 1
    } else if (seq1.startsWith(seq2)) {
148
      // the second sequence is a prefix of the first one
149
      // the lcs is the second sequence
150 1
      seq2.indices.map(i => (i + low1, i + low2)).toList
151 1
    } else if (seq2.startsWith(seq1)) {
152
      // the first sequence is a prefix of the second one
153
      // the lcs is the first sequence
154 1
      seq1.indices.map(i => (i + low1, i + low2)).toList
155 1
    } else if (!haveCommonElements(seq1, seq2)) {
156
      // sequences have no common elements
157 1
      Nil
158 1
    } else {
159
      // fill the holes with possibly common (not unique) elements
160
      def loop(low1: Int, low2: Int, high1: Int, high2: Int, acc: List[(Int, Int)]): List[(Int, Int)] =
161 1
        if (low1 == high1 || low2 == high2) {
162 1
          acc
163 1
        } else {
164 1
          var lastPos1 = low1 - 1
165 1
          var lastPos2 = low2 - 1
166
          var answer = acc
167 1
          for ((p1, p2) <- longest(uniqueCommons(seq1.view.slice(low1, high1).toList, seq2.view.slice(low2, high2).toList))) {
168
            // recurse between lines which are unique in each sequence
169 1
            val pos1 = p1 + low1
170 1
            val pos2 = p2 + low2
171
            // most of the time we have sequences of similar entries
172 1
            if (lastPos1 + 1 != pos1 || lastPos2 + 1 != pos2)
173 1
              answer = loop(lastPos1 + 1, lastPos2 + 1, pos1, pos2, answer)
174
            lastPos1 = pos1
175
            lastPos2 = pos2
176 1
            answer = (pos1, pos2) :: answer
177
          }
178 1
          if (answer.size > acc.size) {
179
            // the size of the accumulator increased, find
180
            // matches between the last match and the end
181 1
            loop(lastPos1 + 1, lastPos2 + 1, high1, high2, answer)
182 1
          } else if (seq1(low1) === seq2(low2)) {
183
            // find lines that match at the beginning
184
            var newLow1 = low1
185
            var newLow2 = low2
186 1
            while (newLow1 < high1 && newLow2 < high2 && seq1(newLow1) === seq2(newLow2)) {
187 1
              answer = (newLow1, newLow2) :: answer
188 1
              newLow1 += 1
189 1
              newLow2 += 1
190
            }
191 1
            loop(newLow1, newLow2, high1, high2, answer)
192 1
          } else if (seq1(high1 - 1) === seq2(high2 - 1)) {
193
            // find lines that match at the end
194 1
            var newHigh1 = high1 - 1
195 1
            var newHigh2 = high2 - 1
196 1
            while (newHigh1 > low1 && newHigh2 > low2 && seq1(newHigh1 - 1) === seq2(newHigh2 - 1)) {
197 1
              newHigh1 -= 1
198 1
              newHigh2 -= 1
199
            }
200 1
            answer = loop(lastPos1 + 1, lastPos2 + 1, newHigh1, newHigh2, answer)
201 1
            for (i <- 0 until (high1 - newHigh1))
202 1
              answer = (newHigh1 + i, newHigh2 + i) :: answer
203
            answer
204
          } else {
205 1
            classicLcs match {
206
              case Some(classicLcs) =>
207
                // fall back to classic LCS algorithm when there is no unique common elements
208
                // between both sequences and they have no common prefix nor suffix
209
                // raw patience algorithm is not good for finding LCS in such cases
210 1
                classicLcs.lcs(seq1, seq2, low1, high1, low2, high2) reverse_::: answer
211

212
              case _ =>
213
                answer
214
            }
215
          }
216

217
        }
218
      // we start with first indices in both sequences
219 1
      loop(low1, low2, high1, high2, Nil).reverse
220
    }
221
  }
222

223
  def savedHashes: Lcs[T] =
224 1
    new HashedLcs(new Patience[Hashed[T]](withFallback))
225

226
}
227

228
private case class Stacked(idx1: Int, idx2: Int, next: Option[Stacked]) {
229
  def chain: List[(Int, Int)] = {
230
    @tailrec
231
    def loop(stacked: Stacked, acc: List[(Int, Int)]): List[(Int, Int)] =
232 1
      stacked.next match {
233
        case Some(next) =>
234 1
          loop(next, (stacked.idx1, stacked.idx2) :: acc)
235
        case None =>
236 1
          (stacked.idx1, stacked.idx2) :: acc
237
      }
238 1
    loop(this, Nil)
239
  }
240
}
241

Read our documentation on viewing source code .

Loading