Start of iTextSharp Custom Module - extract text from PDF

For this test, I installed the deprecated iTextSharp library. Its advantages are that an LGPL-licensed version is available if needed, and that it ships as a single DLL that only has to be added to the Custom Modules folder.

Listing of ExtractTextFromPdf.cs:

using System;
using Robin.Core;
using Robin.Core.Attributes;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

namespace Modules.iText
{
    /// <summary>
    /// Custom action that opens a PDF with iTextSharp and returns the text of
    /// all of its pages as a single string.
    /// </summary>
    [Action(Order = 1)]
    [Throws("ActionError")] // Error name used for every failure raised by Execute.
    public class ExtractTextFromPdf : ActionBase
    {
        #region Properties

        // NOTE: You can find sample description and friendly name entries in Resources

        /// <summary>Path of the PDF file to read; handed to <c>PdfReader</c> as-is.</summary>
        [InputArgument]
        public string PdfPath { get; set; }

        /// <summary>
        /// Text extracted from all pages, in page order, with one line break
        /// between consecutive pages.
        /// </summary>
        [OutputArgument]
        public string PdfText { get; set; }

        #endregion

        #region Methods Overrides

        /// <summary>
        /// Reads the PDF at <see cref="PdfPath"/> and stores the text of every
        /// page in <see cref="PdfText"/>.
        /// </summary>
        /// <param name="context">Action context supplied by the caller; not used here.</param>
        /// <exception cref="ActionException">
        /// Raised with the error name "ActionError" when the path is missing or
        /// when reading/parsing the PDF fails. The original exception is attached
        /// as the inner exception.
        /// </exception>
        public override void Execute(ActionContext context)
        {
            try
            {
                // Fail early with a readable message instead of whatever
                // PdfReader happens to throw for a null/empty path.
                if (string.IsNullOrEmpty(PdfPath))
                {
                    throw new ArgumentException("PdfPath must not be null or empty.");
                }

                using (PdfReader reader = new PdfReader(PdfPath))
                {
                    StringBuilder text = new StringBuilder();
                    int pageCount = reader.NumberOfPages;

                    // iText page numbers are 1-based.
                    for (int page = 1; page <= pageCount; page++)
                    {
                        // Separate pages so the last word of one page does not
                        // run into the first word of the next.
                        if (page > 1)
                        {
                            text.Append(Environment.NewLine);
                        }

                        text.Append(PdfTextExtractor.GetTextFromPage(reader, page));
                    }

                    PdfText = text.ToString();
                }
            }
            catch (ActionException)
            {
                // Already in the form callers expect; keep the original stack trace.
                throw;
            }
            catch (Exception e)
            {
                // Pass the caught exception itself (not e.InnerException) so its
                // type and stack trace stay available for diagnosis.
                throw new ActionException("ActionError", e.Message, e);
            }
        }

        #endregion
    }
}

Output:

Regards,
burque505

1 Like