{
Diff.pas
Object pascal class to compare two string arrays (resp. text files) in the
way of the classical unix standard ``diff´´ program. See the included file
INFO.txt for information on how to use it.
Version 0.4a - always find the most current version at
http://flocke.vssd.de/prog/code/pascal/pasdiff/
Copyright (C) 2005, 2006 Volker Siebert <flocke@vssd.de>
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
}
unit Diff;
interface
uses
SysUtils, Classes;
type
{ TDiffHashValue is the result value of the hash function (unsigned!)
}
TDiffHashValue = Cardinal;
{ This record holds a range of lines. "Start" is the zero based starting
line number and "Stop" the zero based number of the first line *BEHIND*
the range. So, if Start and Stop are equal, you know the position but
the range is empty.
}
TRangeOfLines = record
Start: integer;
Stop: integer;
end;
{ A TDifference is a found difference between the first (left) and the
second (right) string array.
}
PDifference = ^TDifference;
TDifference = record
Left: TRangeOfLines;
Right: TRangeOfLines;
end;
PDifferenceArray = ^TDifferenceArray;
TDifferenceArray = array [0 .. MaxInt div 32] of TDifference;
{ Exception class used by this module.
}
ETextComparerError = class(Exception);
TTextComparerHash = function(const s: string): TDiffHashValue of object; register;
TTextComparerCompare = function(const s1, s2: string): boolean of object; register;
{ Text comparer base class.
}
TCustomTextComparer = class
private
FDiffs: PDifferenceArray;
FCount: integer;
FAlloc: integer;
FHeuristic: integer;
FOnHash: TTextComparerHash;
FOnCompare: TTextComparerCompare;
function GetDiff(Index: integer): TDifference;
procedure SetSize(NewCount: integer);
protected
function DefaultHash(const s: string): TDiffHashValue; register;
function DefaultCompare(const s1, s2: string): boolean; register;
property OnHash: TTextComparerHash read FOnHash write FOnHash;
property OnCompare: TTextComparerCompare read FOnCompare write FOnCompare;
public
constructor Create;
destructor Destroy; override;
procedure Clear;
procedure Compare(List1, List2: TStrings);
property Count: integer read FCount;
property Diff[Index: integer]: TDifference read GetDiff; default;
property Heuristic: integer read FHeuristic write FHeuristic;
end;
{ Text comparer public class with access to the user defined hash and
compare methods.
}
TTextComparer = class(TCustomTextComparer)
public
property OnHash;
property OnCompare;
end;
{ A standard text comparer that can ignore case and whitespace.
Note that OnHash and OnCompare are still protected, because "Compare"
will override them anyway.
}
TStandardTextComparer = class(TCustomTextComparer)
private
FIgnoreCase: boolean;
FIgnoreSpaces: boolean;
FSpacesAsOne: boolean;
protected
function Hash_ynn(const s: string): TDiffHashValue; register;
function Hash_nyn(const s: string): TDiffHashValue; register;
function Hash_yyn(const s: string): TDiffHashValue; register;
function Hash_nny(const s: string): TDiffHashValue; register;
function Hash_yny(const s: string): TDiffHashValue; register;
function Compare_ynn(const s1, s2: string): boolean; register;
function Compare_nyn(const s1, s2: string): boolean; register;
function Compare_yyn(const s1, s2: string): boolean; register;
function Compare_nny(const s1, s2: string): boolean; register;
function Compare_yny(const s1, s2: string): boolean; register;
public
procedure Compare(List1, List2: TStrings);
property IgnoreCase: boolean read FIgnoreCase write FIgnoreCase;
property IgnoreSpaces: boolean read FIgnoreSpaces write FIgnoreSpaces;
property SpacesAsOne: boolean read FSpacesAsOne write FSpacesAsOne;
end;
implementation
constructor TCustomTextComparer.Create;
begin
inherited;
FDiffs := nil;
FCount := 0;
FAlloc := 0;
FHeuristic := 0;
end;
destructor TCustomTextComparer.Destroy;
begin
Clear;
inherited;
end;
{ Internal routine that sets the size of the difference array.
}
procedure TCustomTextComparer.SetSize(NewCount: integer);
begin
FAlloc := NewCount;
ReAllocMem(FDiffs, FAlloc * SizeOf(TDifference));
if FCount > FAlloc then
FCount := FAlloc;
end;
procedure TCustomTextComparer.Clear;
begin
SetSize(0);
end;
function TCustomTextComparer.GetDiff(Index: integer): TDifference;
begin
if (Index < 0) or (Index >= FCount) then
raise ETextComparerError.Create('Invalid item index');
Result := FDiffs^[Index];
end;
{ The default hash just calculates a simple base 19 checksum over the
ascii codes of the character string.
}
function TCustomTextComparer.DefaultHash(const s: string): TDiffHashValue;
var
k, len: integer;
begin
len := Length(s);
Result := len;
for k := 1 to len do
Result := 19 * Result + Ord(s[k]);
end;
{ The default compare function just compares the two strings literally.
}
function TCustomTextComparer.DefaultCompare(const s1, s2: string): boolean;
begin
Result := s1 = s2;
end;
{----------------------------------------------------------------------}
{ This is the big Mojo that does everything. The algorithm is:
1. Inside a given range of lines from the left and the right file,
find the largest block of equal lines.
2. If none is found, mark the range as being different.
3. If one is found, recursively check the two ranges before and
after that block for differences.
}
procedure TCustomTextComparer.Compare(List1, List2: TStrings);
type
TFileInfo = record
Strings: TStrings;
Count: integer;
LineHash: array of TDiffHashValue; // [0 .. Count - 1]
HashLine: array of integer; // [0 .. 2 * Count - 1]
HashColl: array of integer; // [0 .. 2 * Count - 1]
end;
var
HashFunc: TTextComparerHash;
CompareFunc: TTextComparerCompare;
Info1, Info2: TFileInfo;
procedure InitFileInfo(var Info: TFileInfo; AStrings: TStrings);
var
idx, back, coll: Integer;
begin
with Info do
begin
Strings := AStrings;
Count := Strings.Count;
SetLength(LineHash, Count);
SetLength(HashLine, 2 * Count);
SetLength(HashColl, 2 * Count);
if Count < 1 then
exit;
// The upper half of HashLine/HashColl does not need to be initialized.
for idx := 0 to Count - 1 do
begin
LineHash[idx] := HashFunc(Strings[idx]);
HashLine[idx] := -1;
HashColl[idx] := -1;
end;
coll := Count - 1;
for idx := 0 to Count - 1 do
begin
back := integer(Cardinal(LineHash[idx]) mod Cardinal(Count));
if HashLine[back] < 0 then
HashLine[back] := idx
else
begin
inc(coll);
HashLine[coll] := HashLine[back];
HashColl[coll] := HashColl[back];
HashLine[back] := idx;
HashColl[back] := coll;
end;
end;
end;
end;
procedure AddDiff(l1a, l1e, l2a, l2e: integer);
begin
if (l1a >= l1e) and (l2a >= l2e) then
exit;
if FCount >= FAlloc then
SetSize(FAlloc + 32);
with FDiffs^[FCount] do
begin
Left.Start := l1a;
Left.Stop := l1e;
Right.Start := l2a;
Right.Stop := l2e;
end;
inc(FCount);
end;
function DistanceLess(l1a, l1e, l2a, l2e: integer;
bp1, bp2, np1, np2: integer): boolean;
var
t1, b1, t2, b2: integer;
begin
t1 := abs((bp1 - l1a) - (bp2 - l2a));
b1 := abs((l1e - bp1) - (l2e - bp2));
t2 := abs((np1 - l2a) - (np2 - l2a));
b2 := abs((l2e - np1) - (l2e - np2));
Result := (t2 + b2) < (t1 + b1);
end;
function CountEqualLines(l1p, l1e, l2p, l2e, bsz: integer): integer;
var
max: integer;
begin
// Prescan: compare the hashcodes (Index 1 has already been compared)
max := 1;
while (l1p + max < l1e) and
(l2p + max < l2e) and
(Info1.LineHash[l1p + max] = Info2.LineHash[l2p + max]) do
inc(max);
// Better match possible?
Result := 0;
if max < bsz then
exit;
// Final scan: really compare the strings
while (Result < max) and
CompareFunc(Info1.Strings[l1p], Info2.Strings[l2p]) do
begin
inc(l1p);
inc(l2p);
inc(Result);
end;
end;
procedure DiffRange(l1a, l1e, l2a, l2e: integer);
var
bp1, bp2, bsz, scan, idx, pos, cnt: integer;
begin
while true do
begin
if (l1a >= l1e) or (l2a >= l2e) then
break;
bp1 := -1;
bp2 := -1;
bsz := 0;
if l1e - l1a < l2e - l2a then
begin
// Block 1 is smaller
scan := l1a;
while scan < l1e - bsz do
begin
idx := integer(Cardinal(Info1.LineHash[scan]) mod Cardinal(Info2.Count));
repeat
pos := Info2.HashLine[idx];
// Trick: the lines have been added from 0 up to Count-1. So, for
// colliding hashes, the lines are stored in REVERSE order. Thus,
// when we find a line BEFORE l2a, we're done with this block.
// This also catches the -1 at the end of the queue.
if pos < l2a then
break;
if pos + bsz <= l2e then
if Info1.LineHash[scan] = Info2.LineHash[pos] then
if (bsz = 0) or
(Info1.LineHash[scan + bsz - 1] = Info2.LineHash[pos + bsz - 1]) then
begin
cnt := CountEqualLines(scan, l1e, pos, l2e, bsz);
if (cnt > bsz) or
((cnt = bsz) and DistanceLess(l1a, l1e, l2a, l2e, bp1, bp2, scan, pos)) then
begin
bsz := cnt;
bp1 := scan;
bp2 := pos;
if (FHeuristic > 0) and (bsz >= FHeuristic) then
break;
end;
end;
idx := Info2.HashColl[idx];
until idx < 0;
if (FHeuristic > 0) and (bsz >= FHeuristic) then
break;
inc(scan);
end;
end
else
begin
// Block 2 is smaller
scan := l2a;
while scan < l2e - bsz do
begin
idx := integer(Cardinal(Info2.LineHash[scan]) mod Cardinal(Info1.Count));
repeat
pos := Info1.HashLine[idx];
// Trick: the lines have been added from 0 up to Count-1. So, for
// colliding hashes, the lines are stored in REVERSE order. Thus,
// when we find a line BEFORE l1a, we're done with this block.
// This also catches the -1 at the end of the queue.
if pos < l1a then
break;
if pos + bsz <= l1e then
if Info1.LineHash[pos] = Info2.LineHash[scan] then
if (bsz = 0) or
(Info1.LineHash[pos + bsz - 1] = Info2.LineHash[scan + bsz - 1]) then
begin
cnt := CountEqualLines(pos, l1e, scan, l2e, bsz);
if (cnt > bsz) or
((cnt = bsz) and DistanceLess(l1a, l1e, l2a, l2e, bp1, bp2, pos, scan)) then
begin
bsz := cnt;
bp1 := pos;
bp2 := scan;
if (FHeuristic > 0) and (bsz >= FHeuristic) then
break;
end;
end;
idx := Info1.HashColl[idx];
until idx < 0;
if (FHeuristic > 0) and (bsz >= FHeuristic) then
break;
inc(scan);
end;
end;
if bsz = 0 then
break;
// This way the diffs are always sorted correctly
DiffRange(l1a, bp1, l2a, bp2);
l1a := bp1 + bsz;
l2a := bp2 + bsz;
// This way the recursion would be optimized, but we had to
// sort the diffs after all passes
{
if (bp1 - l1a) + (bp2 - l2a) < (l1e - bp1 - bsz) + (l2e - bp2 - bsz) then
begin
DiffRange(l1a, bp1, l2a, bp2);
l1a := bp1 + bsz;
l2a := bp2 + bsz;
end
else
begin
DiffRange(bp1 + bsz, l1e, bp2 + bsz, l2e);
l1e := bp1;
l2e := bp2;
end;
}
end;
AddDiff(l1a, l1e, l2a, l2e);
end;
begin
Clear;
if Assigned(FOnHash) and Assigned(FOnCompare) then
begin
HashFunc := FOnHash;
CompareFunc := FOnCompare;
end
else
begin
HashFunc := DefaultHash;
CompareFunc := DefaultCompare;
end;
// Delphi does Initialize/Finalize, no need to protect with try-finally
InitFileInfo(Info1, List1);
InitFileInfo(Info2, List2);
DiffRange(0, Info1.Count, 0, Info2.Count);
end;
{----------------------------------------------------------------------}
const
WhiteSpace = [#0, #9, #10, #13, #32];
{ Remove all whitespace characters from the given string.
}
function RemoveSpaces(const s: string): string;
var
len, p: integer;
pc: PChar;
begin
len := Length(s);
SetLength(Result, len);
pc := @s[1];
p := 0;
while len > 0 do
begin
if not (pc^ in WhiteSpace) then
begin
inc(p);
Result[p] := pc^;
end;
inc(pc);
dec(len);
end;
SetLength(Result, p);
end;
{ Remove all leading and trailing whitespace from the given string and
also reduce all sequences of whitespace characters in the middle to a
single space character.
}
function ReduceSpaces(const s: string): string;
var
len, p, sp: integer;
pc: PChar;
begin
len := Length(s);
SetLength(Result, len);
pc := @s[1];
p := 0;
sp := 0;
while len > 0 do
begin
if pc^ in WhiteSpace then
inc(sp, p)
else
begin
if sp > 0 then
begin
sp := 0;
inc(p);
Result[p] := ' ';
end;
inc(p);
Result[p] := pc^;
end;
inc(pc);
dec(len);
end;
SetLength(Result, p);
end;
{----------------------------------------------------------------------}
{ Five different hash and compare functions for all possible combinations
of IgnoreCase, IgnoreSpaces, and SpacesAsOne. Naming:
Hash_abc a = IgnoreCase (y/n)
Compare_abc b = IgnoreSpaces (y/n)
c = SpacesAsOne (y/n)
}
function TStandardTextComparer.Hash_ynn(const s: string): TDiffHashValue;
begin
Result := DefaultHash(Lowercase(s));
end;
function TStandardTextComparer.Hash_nyn(const s: string): TDiffHashValue;
begin
Result := DefaultHash(RemoveSpaces(s));
end;
function TStandardTextComparer.Hash_yyn(const s: string): TDiffHashValue;
begin
Result := DefaultHash(Lowercase(RemoveSpaces(s)));
end;
function TStandardTextComparer.Hash_nny(const s: string): TDiffHashValue;
begin
Result := DefaultHash(ReduceSpaces(s));
end;
function TStandardTextComparer.Hash_yny(const s: string): TDiffHashValue;
begin
Result := DefaultHash(Lowercase(ReduceSpaces(s)));
end;
function TStandardTextComparer.Compare_ynn(const s1, s2: string): boolean;
begin
Result := Lowercase(s1) = Lowercase(s2);
end;
function TStandardTextComparer.Compare_nyn(const s1, s2: string): boolean;
begin
Result := RemoveSpaces(s1) = RemoveSpaces(s2);
end;
function TStandardTextComparer.Compare_yyn(const s1, s2: string): boolean;
begin
Result := Lowercase(RemoveSpaces(s1)) = Lowercase(RemoveSpaces(s2));
end;
function TStandardTextComparer.Compare_nny(const s1, s2: string): boolean;
begin
Result := ReduceSpaces(s1) = ReduceSpaces(s2);
end;
function TStandardTextComparer.Compare_yny(const s1, s2: string): boolean;
begin
Result := Lowercase(ReduceSpaces(s1)) = Lowercase(ReduceSpaces(s2));
end;
{ The new compare function just sets OnHash and OnCompare according to the
settings of IgnoreCase, IgnoreSpaces and SpacesAsOne and calls the old
compare function to do the work.
}
procedure TStandardTextComparer.Compare(List1, List2: TStrings);
var
HashFunc: TTextComparerHash;
CompareFunc: TTextComparerCompare;
begin
HashFunc := FOnHash;
CompareFunc := FOnCompare;
try
case 4 * Ord(FIgnoreCase) + 2 * Ord(FIgnoreSpaces) + Ord(FSpacesAsOne) of
0 + 0 + 0: begin
// Use the default hash/compare functions
FOnHash := nil;
FOnCompare := nil;
end;
0 + 0 + 1: begin
FOnHash := Hash_nny;
FOnCompare := Compare_nny;
end;
0 + 2 + 0,
0 + 2 + 1: begin
FOnHash := Hash_nyn;
FOnCompare := Compare_nyn;
end;
4 + 0 + 0: begin
FOnHash := Hash_ynn;
FOnCompare := Compare_ynn;
end;
4 + 0 + 1: begin
FOnHash := Hash_yny;
FOnCompare := Compare_yny;
end;
4 + 2 + 0,
4 + 2 + 1: begin
FOnHash := Hash_yyn;
FOnCompare := Compare_yyn;
end;
end;
inherited Compare(List1, List2);
finally
FOnHash := HashFunc;
FOnCompare := CompareFunc;
end;
end;
end. |