Open Meir017 opened 6 years ago
When iterating through a PDF file, a common approach is to write some kind of recursive method.
A visitor class would make this simpler, something like this:
/// <summary>
/// Base visitor over a tree of <c>CObject</c> nodes. All <c>Visit*</c> methods are
/// virtual: the leaf visits (name, string, comment, integer, real, number) are
/// empty by default, while operators, arrays and sequences hand every contained
/// object to <see cref="VisitObject"/>.
/// </summary>
public class PdfCObjectVisitor
{
    /// <summary>Entry point: starts the visit at <paramref name="object"/>.</summary>
    /// <param name="object">The node to visit.</param>
    // Note: the CObject class should contain a virtual Accept method.
    public void Accept(CObject @object)
    {
        VisitObject(@object);
    }

    /// <summary>Called for a <c>CName</c>; does nothing by default.</summary>
    public virtual void VisitName(CName name)
    {
    }

    /// <summary>Called for a <c>CString</c>; does nothing by default.</summary>
    public virtual void VisitString(CString @string)
    {
    }

    /// <summary>Called for a <c>COperator</c>; by default visits its operands as a sequence.</summary>
    public virtual void VisitOperator(COperator @operator)
    {
        var operands = @operator.Operands;
        VisitSequence(operands);
    }

    /// <summary>Called for a <c>CComment</c>; does nothing by default.</summary>
    public virtual void VisitComment(CComment comment)
    {
    }

    /// <summary>Called for a <c>CArray</c>; by default visits each element in enumeration order.</summary>
    public virtual void VisitArray(CArray array)
    {
        foreach (var element in array)
        {
            VisitObject(element);
        }
    }

    /// <summary>Called for a <c>CInteger</c>; does nothing by default.</summary>
    public virtual void VisitInterger(CInteger integer)
    {
    }

    /// <summary>Called for a <c>CReal</c>; does nothing by default.</summary>
    public virtual void VisitReal(CReal real)
    {
    }

    /// <summary>Called for a <c>CNumber</c>; does nothing by default.</summary>
    public virtual void VisitNumber(CNumber number)
    {
    }

    /// <summary>Called for a <c>CSequence</c>; by default visits each item in enumeration order.</summary>
    public virtual void VisitSequence(CSequence sequence)
    {
        foreach (var item in sequence)
        {
            VisitObject(item);
        }
    }

    /// <summary>
    /// Dispatches <paramref name="object"/> to the matching <c>Visit*</c> method.
    /// The type checks run in the order written and only the first match is
    /// invoked; an object matching none of them (including <c>null</c>) is ignored.
    /// </summary>
    /// <param name="object">The node to dispatch.</param>
    public virtual void VisitObject(CObject @object)
    {
        if (@object is CName nameNode)
        {
            VisitName(nameNode);
        }
        else if (@object is CString stringNode)
        {
            VisitString(stringNode);
        }
        else if (@object is COperator operatorNode)
        {
            VisitOperator(operatorNode);
        }
        else if (@object is CComment commentNode)
        {
            VisitComment(commentNode);
        }
        else if (@object is CArray arrayNode)
        {
            VisitArray(arrayNode);
        }
        else if (@object is CInteger integerNode)
        {
            VisitInterger(integerNode);
        }
        else if (@object is CReal realNode)
        {
            VisitReal(realNode);
        }
        else if (@object is CNumber numberNode)
        {
            VisitNumber(numberNode);
        }
        else if (@object is CSequence sequenceNode)
        {
            VisitSequence(sequenceNode);
        }
    }
}
Then writing a class that extracts all of the text from a PDF is really simple:
/// <summary>
/// Visitor that accumulates string values into <see cref="Builder"/>, descending
/// only into operators whose op code is <c>TJ</c> or <c>Tj</c>.
/// </summary>
public class TextExtractorPdfVisitor : PdfCObjectVisitor
{
    /// <summary>Receives the value of every visited string, in visit order.</summary>
    public StringBuilder Builder { get; } = new StringBuilder();

    /// <summary>
    /// Forwards to the base implementation only for <c>TJ</c> and <c>Tj</c>
    /// operators; any other operator is skipped without visiting its operands.
    /// </summary>
    /// <param name="operator">The operator being visited.</param>
    public override void VisitOperator(COperator @operator)
    {
        var opCodeName = @operator.OpCode.OpCodeName;
        var isWanted = opCodeName == OpCodeName.TJ || opCodeName == OpCodeName.Tj;

        if (isWanted)
        {
            base.VisitOperator(@operator);
        }
    }

    /// <summary>Appends the string's <c>Value</c> to <see cref="Builder"/>.</summary>
    /// <param name="string">The string being visited.</param>
    public override void VisitString(CString @string)
    {
        var text = @string.Value;
        Builder.Append(text);
    }
}
When iterating through a PDF file, a common approach is to write some kind of recursive method.
A visitor class would make this simpler, something like this:
Then writing a class that extracts all of the text from a PDF is really simple: