Pascal Diff, Version 0.4a

Datei: Diff.pas

{
  Diff.pas

  Object pascal class to compare two string arrays (resp. text files) in the
  way of the classical unix standard ``diff´´ program. See the included file
  INFO.txt for information on how to use it.

  Version 0.4a - always find the most current version at
  http://flocke.vssd.de/prog/code/pascal/pasdiff/

  Copyright (C) 2005, 2006 Volker Siebert <flocke@vssd.de>
  All rights reserved.

  Permission is hereby granted, free of charge, to any person obtaining a
  copy of this software and associated documentation files (the "Software"),
  to deal in the Software without restriction, including without limitation
  the rights to use, copy, modify, merge, publish, distribute, sublicense,
  and/or sell copies of the Software, and to permit persons to whom the
  Software is furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.
}

unit Diff;

interface

uses
  SysUtils, Classes;

type
  { TDiffHashValue is the result value of the hash function (unsigned!)
  }
  TDiffHashValue = Cardinal;

  { This record holds a range of lines. "Start" is the zero based starting
    line number and "Stop" the zero based number of the first line *BEHIND*
    the range. So, if Start and Stop are equal, you know the position but
    the range is empty.
  }
  TRangeOfLines = record
    Start: integer;
    Stop: integer;
  end;

  { A TDifference is a found difference between the first (left) and the
    second (right) string array.
  }
  PDifference = ^TDifference;
  TDifference = record
    Left: TRangeOfLines;
    Right: TRangeOfLines;
  end;

  PDifferenceArray = ^TDifferenceArray;
  TDifferenceArray = array [0 .. MaxInt div 32] of TDifference;

  { Exception class used by this module.
  }
  ETextComparerError = class(Exception);

  TTextComparerHash = function(const s: string): TDiffHashValue of object; register;
  TTextComparerCompare = function(const s1, s2: string): boolean of object; register;

  { Text comparer base class.
  }
  TCustomTextComparer = class
  private
    FDiffs: PDifferenceArray;
    FCount: integer;
    FAlloc: integer;
    FHeuristic: integer;
    FOnHash: TTextComparerHash;
    FOnCompare: TTextComparerCompare;
    function GetDiff(Index: integer): TDifference;
    procedure SetSize(NewCount: integer);
  protected
    function DefaultHash(const s: string): TDiffHashValue; register;
    function DefaultCompare(const s1, s2: string): boolean; register;
    property OnHash: TTextComparerHash read FOnHash write FOnHash;
    property OnCompare: TTextComparerCompare read FOnCompare write FOnCompare;
  public
    constructor Create;
    destructor Destroy; override;
    procedure Clear;
    procedure Compare(List1, List2: TStrings);
    property Count: integer read FCount;
    property Diff[Index: integer]: TDifference read GetDiff; default;
    property Heuristic: integer read FHeuristic write FHeuristic;
  end;

  { Text comparer public class with access to the user defined hash and
    compare methods.
  }
  TTextComparer = class(TCustomTextComparer)
  public
    property OnHash;
    property OnCompare;
  end;

  { A standard text comparer that can ignore case and whitespace.
    Note that OnHash and OnCompare are still protected, because "Compare"
    will override them anyway.
  }
  TStandardTextComparer = class(TCustomTextComparer)
  private
    FIgnoreCase: boolean;
    FIgnoreSpaces: boolean;
    FSpacesAsOne: boolean;
  protected
    function Hash_ynn(const s: string): TDiffHashValue; register;
    function Hash_nyn(const s: string): TDiffHashValue; register;
    function Hash_yyn(const s: string): TDiffHashValue; register;
    function Hash_nny(const s: string): TDiffHashValue; register;
    function Hash_yny(const s: string): TDiffHashValue; register;
    function Compare_ynn(const s1, s2: string): boolean; register;
    function Compare_nyn(const s1, s2: string): boolean; register;
    function Compare_yyn(const s1, s2: string): boolean; register;
    function Compare_nny(const s1, s2: string): boolean; register;
    function Compare_yny(const s1, s2: string): boolean; register;
  public
    procedure Compare(List1, List2: TStrings);
    property IgnoreCase: boolean read FIgnoreCase write FIgnoreCase;
    property IgnoreSpaces: boolean read FIgnoreSpaces write FIgnoreSpaces;
    property SpacesAsOne: boolean read FSpacesAsOne write FSpacesAsOne;
  end;

implementation

constructor TCustomTextComparer.Create;
begin
  inherited;
  FDiffs := nil;
  FCount := 0;
  FAlloc := 0;
  FHeuristic := 0;
end;

destructor TCustomTextComparer.Destroy;
begin
  Clear;
  inherited;
end;

{ Internal routine that sets the size of the difference array.
}
procedure TCustomTextComparer.SetSize(NewCount: integer);
begin
  FAlloc := NewCount;
  ReAllocMem(FDiffs, FAlloc * SizeOf(TDifference));
  if FCount > FAlloc then
    FCount := FAlloc;
end;

procedure TCustomTextComparer.Clear;
begin
  SetSize(0);
end;

function TCustomTextComparer.GetDiff(Index: integer): TDifference;
begin
  if (Index < 0) or (Index >= FCount) then
    raise ETextComparerError.Create('Invalid item index');

  Result := FDiffs^[Index];
end;

{ The default hash just calculates a simple base 19 checksum over the
  ascii codes of the character string.
}
function TCustomTextComparer.DefaultHash(const s: string): TDiffHashValue;
var
  k, len: integer;
begin
  len := Length(s);
  Result := len;
  for k := 1 to len do
    Result := 19 * Result + Ord(s[k]);
end;

{ The default compare function just compares the two strings literally.
}
function TCustomTextComparer.DefaultCompare(const s1, s2: string): boolean;
begin
  Result := s1 = s2;
end;

{----------------------------------------------------------------------}

{ This is the big Mojo that does everything. The algorithm is:
  1. Inside a given range of lines from the left and the right file,
     find the largest block of equal lines.
  2. If none is found, mark the range as being different.
  3. If one is found, recursively check the two ranges before and
     after that block for differences.
}
procedure TCustomTextComparer.Compare(List1, List2: TStrings);
type
  TFileInfo = record
    Strings: TStrings;
    Count: integer;
    LineHash: array of TDiffHashValue;    // [0 ..     Count - 1]
    HashLine: array of integer;           // [0 .. 2 * Count - 1]
    HashColl: array of integer;           // [0 .. 2 * Count - 1]
  end;

var
  HashFunc: TTextComparerHash;
  CompareFunc: TTextComparerCompare;
  Info1, Info2: TFileInfo;

  procedure InitFileInfo(var Info: TFileInfo; AStrings: TStrings);
  var
    idx, back, coll: Integer;
  begin
    with Info do
    begin
      Strings := AStrings;
      Count := Strings.Count;
      SetLength(LineHash, Count);
      SetLength(HashLine, 2 * Count);
      SetLength(HashColl, 2 * Count);

      if Count < 1 then
        exit;

      // The upper half of HashLine/HashColl does not need to be initialized.
      for idx := 0 to Count - 1 do
      begin
        LineHash[idx] := HashFunc(Strings[idx]);
        HashLine[idx] := -1;
        HashColl[idx] := -1;
      end;

      coll := Count - 1;
      for idx := 0 to Count - 1 do
      begin
        back := integer(Cardinal(LineHash[idx]) mod Cardinal(Count));
        if HashLine[back] < 0 then
          HashLine[back] := idx
        else
        begin
          inc(coll);
          HashLine[coll] := HashLine[back];
          HashColl[coll] := HashColl[back];
          HashLine[back] := idx;
          HashColl[back] := coll;
        end;
      end;
    end;
  end;

  procedure AddDiff(l1a, l1e, l2a, l2e: integer);
  begin
    if (l1a >= l1e) and (l2a >= l2e) then
      exit;

    if FCount >= FAlloc then
      SetSize(FAlloc + 32);

    with FDiffs^[FCount] do
    begin
      Left.Start := l1a;
      Left.Stop := l1e;
      Right.Start := l2a;
      Right.Stop := l2e;
    end;

    inc(FCount);
  end;

  function DistanceLess(l1a, l1e, l2a, l2e: integer;
                        bp1, bp2, np1, np2: integer): boolean;
  var
    t1, b1, t2, b2: integer;
  begin
    t1 := abs((bp1 - l1a) - (bp2 - l2a));
    b1 := abs((l1e - bp1) - (l2e - bp2));
    t2 := abs((np1 - l2a) - (np2 - l2a));
    b2 := abs((l2e - np1) - (l2e - np2));
    Result := (t2 + b2) < (t1 + b1);
  end;

  function CountEqualLines(l1p, l1e, l2p, l2e, bsz: integer): integer;
  var
    max: integer;
  begin
    // Prescan: compare the hashcodes (Index 1 has already been compared)
    max := 1;
    while (l1p + max < l1e) and
          (l2p + max < l2e) and
          (Info1.LineHash[l1p + max] = Info2.LineHash[l2p + max]) do
      inc(max);

    // Better match possible?
    Result := 0;
    if max < bsz then
      exit;

    // Final scan: really compare the strings
    while (Result < max) and
          CompareFunc(Info1.Strings[l1p], Info2.Strings[l2p]) do
    begin
      inc(l1p);
      inc(l2p);
      inc(Result);
    end;
  end;

  procedure DiffRange(l1a, l1e, l2a, l2e: integer);
  var
    bp1, bp2, bsz, scan, idx, pos, cnt: integer;
  begin
    while true do
    begin
      if (l1a >= l1e) or (l2a >= l2e) then
        break;

      bp1 := -1;
      bp2 := -1;
      bsz := 0;

      if l1e - l1a < l2e - l2a then
      begin
        // Block 1 is smaller
        scan := l1a;
        while scan < l1e - bsz do
        begin
          idx := integer(Cardinal(Info1.LineHash[scan]) mod Cardinal(Info2.Count));
          repeat
            pos := Info2.HashLine[idx];

            // Trick: the lines have been added from 0 up to Count-1. So, for
            // colliding hashes, the lines are stored in REVERSE order. Thus,
            // when we find a line BEFORE l2a, we're done with this block.
            // This also catches the -1 at the end of the queue.
            if pos < l2a then
              break;

            if pos + bsz <= l2e then
              if Info1.LineHash[scan] = Info2.LineHash[pos] then
                if (bsz = 0) or
                   (Info1.LineHash[scan + bsz - 1] = Info2.LineHash[pos + bsz - 1]) then
                begin
                  cnt := CountEqualLines(scan, l1e, pos, l2e, bsz);
                  if (cnt > bsz) or
                     ((cnt = bsz) and DistanceLess(l1a, l1e, l2a, l2e, bp1, bp2, scan, pos)) then
                  begin
                    bsz := cnt;
                    bp1 := scan;
                    bp2 := pos;

                    if (FHeuristic > 0) and (bsz >= FHeuristic) then
                      break;
                  end;
                end;

            idx := Info2.HashColl[idx];
          until idx < 0;

          if (FHeuristic > 0) and (bsz >= FHeuristic) then
            break;

          inc(scan);
        end;
      end
      else
      begin
        // Block 2 is smaller
        scan := l2a;
        while scan < l2e - bsz do
        begin
          idx := integer(Cardinal(Info2.LineHash[scan]) mod Cardinal(Info1.Count));
          repeat
            pos := Info1.HashLine[idx];

            // Trick: the lines have been added from 0 up to Count-1. So, for
            // colliding hashes, the lines are stored in REVERSE order. Thus,
            // when we find a line BEFORE l1a, we're done with this block.
            // This also catches the -1 at the end of the queue.
            if pos < l1a then
              break;

            if pos + bsz <= l1e then
              if Info1.LineHash[pos] = Info2.LineHash[scan] then
                if (bsz = 0) or
                   (Info1.LineHash[pos + bsz - 1] = Info2.LineHash[scan + bsz - 1]) then
                begin
                  cnt := CountEqualLines(pos, l1e, scan, l2e, bsz);
                  if (cnt > bsz) or
                     ((cnt = bsz) and DistanceLess(l1a, l1e, l2a, l2e, bp1, bp2, pos, scan)) then
                  begin
                    bsz := cnt;
                    bp1 := pos;
                    bp2 := scan;

                    if (FHeuristic > 0) and (bsz >= FHeuristic) then
                      break;
                  end;
                end;

            idx := Info1.HashColl[idx];
          until idx < 0;

          if (FHeuristic > 0) and (bsz >= FHeuristic) then
            break;

          inc(scan);
        end;
      end;

      if bsz = 0 then
        break;

      // This way the diffs are always sorted correctly
      DiffRange(l1a, bp1, l2a, bp2);
      l1a := bp1 + bsz;
      l2a := bp2 + bsz;

      // This way the recursion would be optimized, but we had to
      // sort the diffs after all passes
      {
      if (bp1 - l1a) + (bp2 - l2a) < (l1e - bp1 - bsz) + (l2e - bp2 - bsz) then
      begin
        DiffRange(l1a, bp1, l2a, bp2);
        l1a := bp1 + bsz;
        l2a := bp2 + bsz;
      end
      else
      begin
        DiffRange(bp1 + bsz, l1e, bp2 + bsz, l2e);
        l1e := bp1;
        l2e := bp2;
      end;
      }
    end;

    AddDiff(l1a, l1e, l2a, l2e);
  end;

begin
  Clear;

  if Assigned(FOnHash) and Assigned(FOnCompare) then
  begin
    HashFunc := FOnHash;
    CompareFunc := FOnCompare;
  end
  else
  begin
    HashFunc := DefaultHash;
    CompareFunc := DefaultCompare;
  end;

  // Delphi does Initialize/Finalize, no need to protect with try-finally
  InitFileInfo(Info1, List1);
  InitFileInfo(Info2, List2);
  DiffRange(0, Info1.Count, 0, Info2.Count);
end;

{----------------------------------------------------------------------}

const
  WhiteSpace = [#0, #9, #10, #13, #32];

{ Remove all whitespace characters from the given string.
}
function RemoveSpaces(const s: string): string;
var
  len, p: integer;
  pc: PChar;
begin
  len := Length(s);
  SetLength(Result, len);

  pc := @s[1];
  p := 0;
  while len > 0 do
  begin
    if not (pc^ in WhiteSpace) then
    begin
      inc(p);
      Result[p] := pc^;
    end;

    inc(pc);
    dec(len);
  end;

  SetLength(Result, p);
end;

{ Remove all leading and trailing whitespace from the given string and
  also reduce all sequences of whitespace characters in the middle to a
  single space character.
}
function ReduceSpaces(const s: string): string;
var
  len, p, sp: integer;
  pc: PChar;
begin
  len := Length(s);
  SetLength(Result, len);

  pc := @s[1];
  p := 0;
  sp := 0;
  while len > 0 do
  begin
    if pc^ in WhiteSpace then
      inc(sp, p)
    else
    begin
      if sp > 0 then
      begin
        sp := 0;
        inc(p);
        Result[p] := ' ';
      end;

      inc(p);
      Result[p] := pc^;
    end;

    inc(pc);
    dec(len);
  end;

  SetLength(Result, p);
end;

{----------------------------------------------------------------------}

{ Five different hash and compare functions for all possible combinations
  of IgnoreCase, IgnoreSpaces, and SpacesAsOne. Naming:

     Hash_abc     a = IgnoreCase (y/n)
  Compare_abc     b = IgnoreSpaces (y/n)
                  c = SpacesAsOne (y/n)
}
function TStandardTextComparer.Hash_ynn(const s: string): TDiffHashValue;
begin
  Result := DefaultHash(Lowercase(s));
end;

function TStandardTextComparer.Hash_nyn(const s: string): TDiffHashValue;
begin
  Result := DefaultHash(RemoveSpaces(s));
end;

function TStandardTextComparer.Hash_yyn(const s: string): TDiffHashValue;
begin
  Result := DefaultHash(Lowercase(RemoveSpaces(s)));
end;

function TStandardTextComparer.Hash_nny(const s: string): TDiffHashValue;
begin
  Result := DefaultHash(ReduceSpaces(s));
end;

function TStandardTextComparer.Hash_yny(const s: string): TDiffHashValue;
begin
  Result := DefaultHash(Lowercase(ReduceSpaces(s)));
end;

function TStandardTextComparer.Compare_ynn(const s1, s2: string): boolean;
begin
  Result := Lowercase(s1) = Lowercase(s2);
end;

function TStandardTextComparer.Compare_nyn(const s1, s2: string): boolean;
begin
  Result := RemoveSpaces(s1) = RemoveSpaces(s2);
end;

function TStandardTextComparer.Compare_yyn(const s1, s2: string): boolean;
begin
  Result := Lowercase(RemoveSpaces(s1)) = Lowercase(RemoveSpaces(s2));
end;

function TStandardTextComparer.Compare_nny(const s1, s2: string): boolean;
begin
  Result := ReduceSpaces(s1) = ReduceSpaces(s2);
end;

function TStandardTextComparer.Compare_yny(const s1, s2: string): boolean;
begin
  Result := Lowercase(ReduceSpaces(s1)) = Lowercase(ReduceSpaces(s2));
end;

{ The new compare function just sets OnHash and OnCompare according to the
  settings of IgnoreCase, IgnoreSpaces and SpacesAsOne and calls the old
  compare function to do the work.
}
procedure TStandardTextComparer.Compare(List1, List2: TStrings);
var
  HashFunc: TTextComparerHash;
  CompareFunc: TTextComparerCompare;
begin
  HashFunc := FOnHash;
  CompareFunc := FOnCompare;

  try
    case 4 * Ord(FIgnoreCase) + 2 * Ord(FIgnoreSpaces) + Ord(FSpacesAsOne) of
      0 + 0 + 0: begin
        // Use the default hash/compare functions
        FOnHash := nil;
        FOnCompare := nil;
      end;
      0 + 0 + 1: begin
        FOnHash := Hash_nny;
        FOnCompare := Compare_nny;
      end;
      0 + 2 + 0,
      0 + 2 + 1: begin
        FOnHash := Hash_nyn;
        FOnCompare := Compare_nyn;
      end;
      4 + 0 + 0: begin
        FOnHash := Hash_ynn;
        FOnCompare := Compare_ynn;
      end;
      4 + 0 + 1: begin
        FOnHash := Hash_yny;
        FOnCompare := Compare_yny;
      end;
      4 + 2 + 0,
      4 + 2 + 1: begin
        FOnHash := Hash_yyn;
        FOnCompare := Compare_yyn;
      end;
    end;

    inherited Compare(List1, List2);
  finally
    FOnHash := HashFunc;
    FOnCompare := CompareFunc;
  end;
end;

end.