empira / PDFsharp-1.5

A .NET library for processing PDF
MIT License
1.28k stars 588 forks source link

[feature] support for Visitor pattern #68

Open Meir017 opened 6 years ago

Meir017 commented 6 years ago

When iterating through a PDF file, a common approach is to write some kind of recursive method.

A visitor class would make this simpler, something like this:

/// <summary>
/// Base visitor for a tree of PDF content-stream objects (<c>CObject</c> and its subtypes).
/// Every <c>Visit*</c> method is virtual; the defaults do nothing for leaf objects and
/// recurse into the children of operators, arrays and sequences. Derive from this class
/// and override only the hooks you care about.
/// </summary>
public class PdfCObjectVisitor
{
    // the CObject class should contain a virtual Accept method
    /// <summary>
    /// Entry point for a traversal: hands <paramref name="object"/> to <see cref="VisitObject"/>.
    /// </summary>
    /// <param name="object">Root object of the traversal.</param>
    public void Accept(CObject @object) => VisitObject(@object);

    /// <summary>Called for each <c>CName</c>. The default implementation does nothing.</summary>
    public virtual void VisitName(CName name)
    {
    }

    /// <summary>Called for each <c>CString</c>. The default implementation does nothing.</summary>
    public virtual void VisitString(CString @string)
    {
    }

    /// <summary>
    /// Called for each <c>COperator</c>. The default implementation visits the operator's
    /// operands through <see cref="VisitSequence"/>.
    /// </summary>
    public virtual void VisitOperator(COperator @operator)
    {
        VisitSequence(@operator.Operands);
    }

    /// <summary>Called for each <c>CComment</c>. The default implementation does nothing.</summary>
    public virtual void VisitComment(CComment comment)
    {
    }

    /// <summary>
    /// Called for each <c>CArray</c>. The default implementation visits every element
    /// through <see cref="VisitObject"/>.
    /// </summary>
    public virtual void VisitArray(CArray array)
    {
        foreach (var @object in array)
        {
            VisitObject(@object);
        }
    }

    /// <summary>
    /// Called for each <c>CInteger</c> (via <see cref="VisitInterger"/>).
    /// The default implementation does nothing.
    /// </summary>
    public virtual void VisitInteger(CInteger integer)
    {
    }

    /// <summary>
    /// Misspelled legacy name of <see cref="VisitInteger"/>, kept so that existing overrides
    /// and callers continue to work. The default implementation forwards to
    /// <see cref="VisitInteger"/>; new code should override that method instead.
    /// </summary>
    public virtual void VisitInterger(CInteger integer) => VisitInteger(integer);

    /// <summary>Called for each <c>CReal</c>. The default implementation does nothing.</summary>
    public virtual void VisitReal(CReal real)
    {
    }

    /// <summary>
    /// Called for a <c>CNumber</c> that matched neither the <c>CInteger</c> nor the
    /// <c>CReal</c> case in <see cref="VisitObject"/>. The default implementation does nothing.
    /// </summary>
    public virtual void VisitNumber(CNumber number)
    {
    }

    /// <summary>
    /// Called for each <c>CSequence</c>. The default implementation visits every element
    /// through <see cref="VisitObject"/>.
    /// </summary>
    public virtual void VisitSequence(CSequence sequence)
    {
        foreach (var @object in sequence)
        {
            VisitObject(@object);
        }
    }

    /// <summary>
    /// Dispatches <paramref name="object"/> to the <c>Visit*</c> method matching its runtime
    /// type. An object that matches none of the cases (including <c>null</c>) is ignored.
    /// </summary>
    /// <param name="object">The object to dispatch.</param>
    public virtual void VisitObject(CObject @object)
    {
        // NOTE(review): case order matters if, as presumed, CInteger/CReal derive from
        // CNumber and CArray derives from CSequence - the more specific case must come
        // first. Confirm against PDFsharp's Content.Objects before reordering.
        switch (@object)
        {
            case CName name:
                VisitName(name);
                break;
            case CString @string:
                VisitString(@string);
                break;
            case COperator @operator:
                VisitOperator(@operator);
                break;
            case CComment comment:
                VisitComment(comment);
                break;
            case CArray array:
                VisitArray(array);
                break;
            case CInteger integer:
                // Go through the legacy name so overrides of either spelling are invoked;
                // its default implementation forwards to VisitInteger.
                VisitInterger(integer);
                break;
            case CReal real:
                VisitReal(real);
                break;
            case CNumber number:
                VisitNumber(number);
                break;
            case CSequence sequence:
                VisitSequence(sequence);
                break;
        }
    }
}

Then writing a class that extracts all of the text from a PDF is really simple:

/// <summary>
/// Visitor that collects string operands of the <c>TJ</c> and <c>Tj</c> operators into
/// <see cref="Builder"/>. Operands of every other operator are skipped.
/// </summary>
public class TextExtractorPdfVisitor : PdfCObjectVisitor
{
    /// <summary>Accumulates the value of every string visited so far, in visit order.</summary>
    public StringBuilder Builder { get; } = new StringBuilder();

    /// <summary>
    /// Descends into the operands of <paramref name="operator"/> only when its op-code is
    /// <c>TJ</c> or <c>Tj</c>; any other operator is ignored together with its operands.
    /// </summary>
    public override void VisitOperator(COperator @operator)
    {
        var opCodeName = @operator.OpCode.OpCodeName;
        bool isTextOperator = opCodeName == OpCodeName.TJ || opCodeName == OpCodeName.Tj;

        if (isTextOperator)
        {
            base.VisitOperator(@operator);
        }
    }

    /// <summary>Appends the value of <paramref name="string"/> to <see cref="Builder"/>.</summary>
    public override void VisitString(CString @string)
    {
        Builder.Append(@string.Value);
    }
}