This source code file is available for download here: TextDiff.bas(~8 KB)

  1: Attribute VB_Name = "TextDiff"
  2: Option Explicit
  3: Public Const DIFF_INSERT = "+"
  4: Public Const DIFF_DELETE = "-"
  5: Public Const DIFF_COPY = "="
  6: Dim D() As Integer
  7: Private Function Min(a As Integer, b As Integer, c As Integer) As Integer
  8:     Dim result As Integer
  9:     result = a
 10:     If b < result Then result = b
 11:     If c < result Then result = c
 12:     Min = result
 13: End Function
 14: ' Based on article http://www.merriampark.com/ld.htm
 15: ' Calculates string "similarity". The value returned is number of insert/delete operations needed to transform
 16: ' string 's' into string 't'
 17: Public Function LevenshteinDistance(ByVal s As String, ByVal t As String) As Integer
 18:     Dim n As Integer    ' length of s
 19:     Dim m As Integer    ' length of t
 20:     Dim i As Integer
 21:     Dim j As Integer
 22:     Dim cs As String
 23:     Dim ct As String
 24:     Dim cost As Integer
 25: 
 26:     ' Step 1
 27:     n = Len(s)
 28:     m = Len(t)
 29:     ReDim D(0 To n, 0 To m) As Integer
 30: 
 31:     ' Step 2
 32:     For i = 0 To n
 33:         D(i, 0) = i
 34:     Next i
 35: 
 36:     For j = 0 To m
 37:         D(0, j) = j
 38:     Next j
 39: 
 40:     ' Step 3
 41:     For i = 1 To n
 42:         cs = Mid(s, i, 1)
 43: 
 44:         ' Step 4
 45:         For j = 1 To m
 46:             ct = Mid(t, j, 1)
 47: 
 48:             ' Step 5
 49:             If cs = ct Then
 50:                 cost = 0
 51:             Else
 52:                 cost = 1
 53:             End If
 54: 
 55:             ' Step 6
 56:             D(i, j) = Min(D(i - 1, j) + 1, D(i, j - 1) + 1, D(i - 1, j - 1) + cost)
 57:         Next j
 58:     Next i
 59: 
 60:     ' Step 7
 61:     LevenshteinDistance = D(n, m)
 62: End Function
 63: Private Function CanGoTo(ByVal i As Integer, ByVal j As Integer, ByVal di As Integer, ByVal dj As Integer, ByRef nexti As Integer, ByRef nextj As Integer, ByRef nextcost As Integer) As Boolean
 64:     Dim ni As Integer
 65:     Dim nj As Integer
 66:     ni = i + di
 67:     nj = j + dj
 68: 
 69:     CanGoTo = False
 70: 
 71:     If (ni >= 0) And (nj >= 0) Then
 72:         If D(ni, nj) <= nextcost Then
 73:             nextcost = D(ni, nj)
 74:             nexti = ni
 75:             nextj = nj
 76:             CanGoTo = True
 77:         End If
 78:     End If
 79: End Function
 80: Private Function GetCodeAt(ByVal commands As String, index As Integer) As String
 81:     GetCodeAt = Mid(commands, 1 + ((index - 1) * 2), 2)
 82: End Function
 83: Private Sub SetCodeAt(ByRef commands As String, index As Integer, ByVal value As String)
 84:     Dim base As Integer
 85:     base = 1 + ((index - 1) * 2)
 86:     commands = Left(commands, base - 1) & value & Mid(commands, base + 2)
 87: End Sub
 88: Private Function RemoveCodeAt(ByRef commands As String, index As Integer) As String
 89:     Dim base As Integer
 90:     base = 1 + ((index - 1) * 2)
 91:     RemoveCodeAt = Mid(commands, base, 2)
 92:     commands = Left(commands, base - 1) & Mid(commands, base + 2)
 93: End Function
 94: Private Sub InsertCodeAt(ByRef commands As String, index As Integer, ByVal value As String)
 95:     Dim base As Integer
 96:     base = 1 + ((index - 1) * 2)
 97:     commands = Left(commands, base - 1) & value & Mid(commands, base)
 98: End Sub
 99: Private Function OperationAt(ByVal commands As String, index As Integer) As String
100:     OperationAt = Mid(GetCodeAt(commands, index), 1, 1)
101: End Function
102: Private Sub MoveCode(ByRef commands As String, oldIndex As Integer, newIndex As Integer)
103:     If oldIndex = newIndex Then Exit Sub
104: 
105:     Dim value As String
106:     value = RemoveCodeAt(commands, oldIndex)
107:     Call InsertCodeAt(commands, newIndex, value)
108: End Sub
109: ' Based on article http://somethinkodd.com/oddthinking/2006/01/16/comparing-strings-an-analysis-of-diff-algorithms/
110: ' Returns sequence of command-value character pairs.
111: ' Valid commands are DIFF_INSERT, DIFF_DELETE, DIFF_COPY
112: Public Function Diff(ByVal u As String, ByVal v As String) As String
113:     Dim i As Integer
114:     Dim j As Integer
115:     Dim result As String
116: 
117:     Dim ni As Integer
118:     Dim nj As Integer
119:     Dim cost As Integer
120:     Dim operation As String
121:     Dim count As Integer
122: 
123:     ' calculate the matrix
124:     Call LevenshteinDistance(u, v)
125:     i = Len(u)
126:     j = Len(v)
127:     result = ""
128:     count = 0
129: 
130:     ' A minimising path on a D matrix is one which at a given element travels up, left or diagonally up-and-left
131:     ' to the minimum of its "preceding" neighbours. If more than one neighbour has an equal value, the choice
132:     ' is arbitrary. The path continues until it reaches the top left corner.
133:     While (i > 0) Or (j > 0)
134:         cost = 32000    ' some very big number
135: 
136:         ' If the path travels from an element D[i,j] to its neighbour above it, D[i-1,j], this represents
137:         ' the deletion of a symbol from the string u at position i.
138:         If CanGoTo(i, j, -1, 0, ni, nj, cost) Then
139:             operation = DIFF_DELETE & Mid(u, i, 1)
140:         End If
141: 
142:         ' If the path travels from an element D[i,j] to its left neighbour, D[i,j-1],
143:         ' this represents the insertion of a symbol v[j] in the string u at position i.
144:         If CanGoTo(i, j, 0, -1, ni, nj, cost) Then
145:             operation = DIFF_INSERT & Mid(v, j, 1)
146:         End If
147: 
148:         ' If the path travels from an element D[i,j] to its upper-left neighbour, D[i-1,j-1], and u[i] = v[j],
149:         ' then this means that the two strings match at this point, and no operation is required.
150:         ' As character replacement (as opposed to deletion and insertion) is not allowed in this model,
151:         ' the path may not travel to the upper-left neighbour if u[i] <> v[j].
152:         If (i > 0) And (j > 0) Then
153:             If (Mid(u, i, 1) = Mid(v, j, 1)) Then
154:                 If CanGoTo(i, j, -1, -1, ni, nj, cost) Then
155:                     operation = DIFF_COPY & Mid(u, i, 1)
156:                 End If
157:             End If
158:         End If
159: 
160:         result = operation & result
161:         count = count + 1
162:         i = ni
163:         j = nj
164:     Wend
165: 
166:     ' at this point we have the 'result' ready
167:     ' now we will group delete/insert operations to make it nicer
168: 
169:     ' we will translate -a+A-b+B-c+C sequences
170:     ' into              -a-b-c+A+B+C
171:     Dim c As Integer
172:     Dim base As Integer
173:     Dim size As Integer
174:     c = 1
175:     While c <= count - 1
176:         If (OperationAt(result, c + 0) = DIFF_DELETE) And _
177:            (OperationAt(result, c + 1) = DIFF_INSERT) Then
178:             ' start of the sequence
179:             base = c
180:             size = 1
181:             Do
182:                 If (OperationAt(result, base + (size * 2) + 0) = DIFF_DELETE) And _
183:                    (OperationAt(result, base + (size * 2) + 1) = DIFF_INSERT) Then
184:                     size = size + 1
185:                 Else
186:                     Exit Do
187:                 End If
188:             Loop
189:             ' now 'size' is the number of consecutive delete/insert pairs
190:             ' 'base' is the start index
191:             Dim newIndex As Integer
192:             Dim oldIndex As Integer
193:             newIndex = base + 1
194:             oldIndex = base + 2
195:             While size > 1
196:                 Call MoveCode(result, oldIndex, newIndex)
197:                 newIndex = newIndex + 1
198:                 oldIndex = oldIndex + 2
199:                 size = size - 1
200:             Wend
201:             c = newIndex
202:         Else
203:             c = c + 1
204:         End If
205:     Wend
206: 
207:     ' we will translate -a+B+C-d sequences
208:     ' into              -a-d+B+C
209:     c = 1
210:     While c <= count - 1
211:         If (OperationAt(result, c + 0) = DIFF_DELETE) And _
212:            (OperationAt(result, c + 1) = DIFF_INSERT) Then
213:             ' possibly start of insert... delete pattern
214:             base = c
215:             size = 1
216:             Do
217:                 If (OperationAt(result, base + size + 1) = DIFF_INSERT) Then
218:                     size = size + 1
219:                 Else
220:                     Exit Do
221:                 End If
222:             Loop
223: 
224:             ' now 'size' is the number of consecutive inserts
225:             If (OperationAt(result, base + size + 1) = DIFF_DELETE) Then
226:                 ' we have delete operation at 'base' and at 'base + size + 1'
227:                 ' in between there are inserts. lets merge those blocks of deletes
228:                 Do
229:                     Call MoveCode(result, base + size + 1, base + 1)
230:                     base = base + 1
231:                     If OperationAt(result, base + size + 1) <> DIFF_DELETE Then
232:                         c = base + size + 1
233:                         Exit Do
234:                     End If
235:                 Loop
236:             Else
237:                 c = base + size + 1
238:             End If
239:         Else
240:             c = c + 1
241:         End If
242:     Wend
243: 
244:     Diff = result
245: End Function

Do you have questions, comments, feedback?