Program DeDupAndSort;
{$H+}
Uses Classes, SysUtils;
Const
CRLF : String = #13#10;
Type
PInt64 = ^Int64;
//Helper class to read and write lines
TStringIO = Class Helper For TFileStream
Function ReadLine: RawByteString;
Procedure WriteLine(Const S: String);
End;
Function TStringIO.ReadLine: RawByteString;
Var C: Char;
Begin
//Helper class ReadLine
Result:= '';
While Position < Size Do Begin
Read(C, SizeOf(C));
If C = #10 Then Exit
Else If C <> #13 Then Result:= Result + C;
End;
End;
Procedure TStringIO.WriteLine(Const S: String);
Var L: Integer;
Begin
//Helper class WriteLine
L:= Length(S);
If L > 0 Then Write(S[1], L);
Write(CRLF[1], Length(CRLF));
End;
//Global variables
Var
InFile, OutFile: TFileStream;
OutIndex: TList;
FileName, CurWord: String;
InsertAt: Integer;
Tick, MemUsed: Int64;
Procedure MakeRandomFile;
Const cLetters : String = 'abcdefghijklmnopqrstuvxyzABCDEFGHIJKLMNOPQRSTUVXYZ';
cWordCount = 1000000;
cMinWordLen = 5;
cMaxWordLen = 30;
Var W: String;
I, WL, WI: Integer;
Begin
//Create a file of random words where every 10th word is duplicated
Randomize;
With TFileStream.Create(FileName, fmCreate) Do Try
For I:= 1 To cWordCount Do Begin
If I Mod 10 <> 0 Then Begin
W:= '';
WL:= Random(cMaxWordLen - cMinWordLen) + cMinWordLen;
For WI:= 1 To WL Do W:= W + cLetters[Random(Length(cLetters)-1)+1];
End;
WriteLine(W);
End;
Finally
Free;
End;
End;
Function Lookup(Const S: String): Boolean;
Var First, Last, Idx, Cmp: Integer;
LCase, CurStr: String;
Begin
//Lookup "S" using the sorted index
Result:= False;
If OutIndex.Count = 0 Then Begin
//Empty, adjust insert position
InsertAt:= 0;
Exit;
End;
//Lower case for case insensitivity
LCase:= LowerCase(S);
//Adjust first and last element
First:= 0;
Last:= OutIndex.Count - 1;
//This is a basic binary search routine, removing half of the entries
//for each lookup. This is extremely efficient for large data amounts
//but it requires the values to be sorted
While First <= Last Do Begin
//Find index to test (middle of the search range)
Idx:= ((Last - First) Div 2) + First;
//Load the element as lowercase from OutFile
OutFile.Position:= PInt64(OutIndex[Idx])^;
CurStr:= LowerCase(OutFile.ReadLine);
//Compare elements
Cmp:= CompareStr(LCase, CurStr);
//Handle comparison:
If Cmp < 0 Then Last:= Idx - 1 //If less, we cut away upper half
Else If Cmp > 0 Then First:= Idx + 1 //If greater, we cut away the lower half
Else Begin
//We have a match / duplicate
Result:= True;
Exit;
End;
End;
//Adjust the insert position of the tested string
If Cmp > 0 Then Inc(Idx);
InsertAt:= Idx;
End;
Procedure Insert(Const S: String);
Var I: PInt64;
Begin
//Insert a string to OutFIle and its index
New(I);
I^:= OutFile.Size;
OutFile.Position:= I^;
OutFile.WriteLine(S);
OutIndex.Insert(InsertAt, I);
End;
Procedure ExportSorted;
Var I: Integer;
Begin
//Export a sorted version of the de-duplicated file
With TFileStream.Create(FileName + '_sort', fmCreate) Do Try
For I:= 0 To OutIndex.Count - 1 Do Begin
OutFile.Position:= PInt64(OutIndex[I])^;
WriteLine(OutFile.ReadLine);
End;
Finally
Free;
End;
End;
Procedure FreeOutIndex;
Var I: Integer;
Begin
//Release the index
For I:= 0 To OutIndex.Count - 1 Do Dispose(PInt64(OutIndex[I]));
OutIndex.Free;
End;
Begin
//Create a file name
FileName:= ExtractFilePath(ParamStr(0)) + 'bigtextfile.txt';
//If the file does not exists, generate it
If Not FileExists(FileName) Then MakeRandomFile;
//Create input file, output file and index
InFile:= TFileSTream.Create(FileName, fmOpenRead);
OutFile:= TFileStream.Create(FileName + '_ddup', fmCreate);
OutIndex:= TList.Create;
Try
//Store memory used
MemUsed:= GetHeapStatus.TotalAllocated;
//Store tick
Tick:= GetTickCount64;
While InFile.Position < InFile.Size Do Begin
//Read current word
CurWord:= InFile.ReadLine;
//Lookup word and insert if unique
If Not Lookup(CurWord) Then Insert(CurWord)
Else WriteLn('Removing duplicate: ' + CurWord);
End;
//Show memory used
MemUsed:= GetHeapStatus.TotalAllocated - MemUsed;
WriteLn(Format('Total memory used is %d bytes', [MemUsed]));
//Show time used
Tick:= GetTickCount64 - Tick;
WriteLn(Format('Duplicates removed in %.3f seconds', [Tick/1000.0]));
//Export a sorted list of unique words and show speed
Tick:= GetTickCount64;
ExportSorted;
Tick:= GetTickCount64 - Tick;
WriteLn(Format('Export of sorted file completed in %.3f seconds', [Tick/1000.0]));
Finally
//Release da grease!
FreeOutIndex;
OutFile.Free;
InFile.Free;
End;
//Yep!!
WriteLn('Done');
End.