nissl-lab / toxy

.NET text extraction framework
Apache License 2.0
361 stars 107 forks source link

Improved code for Docx Files #26

Open pcinfogmach opened 3 months ago

pcinfogmach commented 3 months ago
using DocumentFormat.OpenXml.Packaging;
using NPOI.HWPF;
using NPOI.HWPF.Extractor;
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Xml;
using WordInterop = Microsoft.Office.Interop.Word;

namespace MsWordTextExtractor
{
    public static class DocxTextExtractor
    {
        /// <summary>
        /// Extracts the text of a Word document, trying each extraction strategy in turn
        /// and returning the result of the first one that does not throw.
        /// </summary>
        /// <param name="filePath">Path of the document to read.</param>
        /// <returns>
        /// The extracted text, or <see cref="string.Empty"/> when every strategy failed.
        /// </returns>
        public static string Extract(string filePath)
        {
            // Ordered from the first strategy attempted to the last resort.
            Func<string, string>[] strategies =
            {
                ReadAllTextParts,
                NpoiDocExtractor,
                WordInteropExtractor
            };

            Exception lastFailure = null;
            foreach (Func<string, string> strategy in strategies)
            {
                try
                {
                    return strategy(filePath);
                }
                catch (Exception failure)
                {
                    // Best effort: remember the failure and fall through to the next strategy.
                    lastFailure = failure;
                }
            }

            // Only the failure of the final strategy is reported; earlier ones are discarded.
            Console.WriteLine(lastFailure.Message);
            return string.Empty;
        }

        /// <summary>
        /// Opens the file as a read-only Open XML word-processing package and collects the
        /// text of the main document part, followed by the footnotes part and the endnotes
        /// part when they exist.
        /// </summary>
        /// <param name="filePath">Path of the package to open.</param>
        /// <returns>
        /// The body text, then each non-empty notes section preceded by a blank line.
        /// </returns>
        /// <exception cref="InvalidDataException">The package has no main document part.</exception>
        static string ReadAllTextParts(string filePath)
        {
            StringBuilder stb = new StringBuilder();
            using (WordprocessingDocument wordprocessingDocument = WordprocessingDocument.Open(filePath, false))
            {
                var mainPart = wordprocessingDocument.MainDocumentPart;
                if (mainPart == null)
                {
                    // Fail with a descriptive error instead of a NullReferenceException.
                    throw new InvalidDataException("The package does not contain a main document part.");
                }

                using (Stream bodyStream = mainPart.GetStream())
                {
                    stb.AppendLine(ReadTextPart(bodyStream));
                }

                AppendNotes(stb, mainPart.FootnotesPart);
                // Endnotes must be read from EndnotesPart; reading FootnotesPart here would
                // duplicate the footnotes, or throw when the document only has endnotes.
                AppendNotes(stb, mainPart.EndnotesPart);
            }
            return stb.ToString();
        }

        /// <summary>
        /// Appends the text of a footnotes/endnotes part, preceded by a blank line, when the
        /// part exists and yields any text.
        /// </summary>
        static void AppendNotes(StringBuilder target, OpenXmlPart notesPart)
        {
            if (notesPart == null)
            {
                return;
            }

            string notes;
            using (Stream notesStream = notesPart.GetStream())
            {
                notes = ReadFootnotesPart(notesStream);
            }

            if (!string.IsNullOrEmpty(notes))
            {
                target.AppendLine();
                target.AppendLine(notes);
            }
        }

        /// <summary>
        /// Loads a WordprocessingML part from the stream and returns the text of every
        /// <c>w:p</c> element, one paragraph per line.
        /// </summary>
        /// <param name="partStream">Stream positioned at the start of the part's XML.</param>
        /// <returns>The paragraph text with leading and trailing whitespace trimmed.</returns>
        static string ReadTextPart(Stream partStream)
        {
            var names = new NameTable();
            var namespaces = new XmlNamespaceManager(names);
            namespaces.AddNamespace("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main");

            var part = new XmlDocument(names);
            part.Load(partStream);

            var text = new StringBuilder();
            foreach (XmlNode paragraph in part.SelectNodes("//w:p", namespaces))
            {
                ReadTextContent(text, paragraph, namespaces);
                // Terminate each paragraph with the platform line break.
                text.AppendLine();
            }

            return text.ToString().Trim();
        }

        /// <summary>
        /// Loads a footnotes or endnotes part from the stream and returns one line per note:
        /// the note id immediately followed by the note text.
        /// </summary>
        /// <param name="partStream">Stream positioned at the start of the part's XML.</param>
        /// <returns>The notes text with leading and trailing whitespace trimmed.</returns>
        static string ReadFootnotesPart(Stream partStream)
        {
            const string wordNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

            NameTable nameTable = new NameTable();
            XmlNamespaceManager xmlNamespaceManager = new XmlNamespaceManager(nameTable);
            xmlNamespaceManager.AddNamespace("w", wordNamespace);
            StringBuilder stringBuilder = new StringBuilder();

            XmlDocument xmlDocument = new XmlDocument(nameTable);
            xmlDocument.Load(partStream);

            XmlNodeList footnoteNodes = xmlDocument.SelectNodes("//w:footnote | //w:endnote", xmlNamespaceManager);
            foreach (XmlNode footnoteNode in footnoteNodes)
            {
                // Look the id up by namespace URI rather than by the literal "w:" prefix, so
                // parts that bind the namespace to another prefix are handled, and tolerate a
                // missing attribute instead of throwing.
                XmlAttribute idAttribute = footnoteNode.Attributes?["id", wordNamespace];
                string footnoteId = idAttribute?.Value;

                // Notes with id -1 and 0 are skipped.
                // NOTE(review): presumably these are Word's separator notes — confirm.
                if (footnoteId == "-1" || footnoteId == "0") { continue; }
                stringBuilder.Append(footnoteId);

                ReadTextContent(stringBuilder, footnoteNode, xmlNamespaceManager);

                stringBuilder.AppendLine();
            }
            return stringBuilder.ToString().Trim();
        }

        /// <summary>
        /// Appends the textual content found beneath <paramref name="xmlNode"/> to
        /// <paramref name="stringBuilder"/>, in document order.
        /// </summary>
        /// <remarks>
        /// Text runs are appended verbatim, tabs as <c>\t</c>, breaks as <c>\v</c>, footnote
        /// references as their id, and numbering properties that carry both a level and a
        /// numbering id as <c>*</c>.
        /// </remarks>
        /// <param name="stringBuilder">Receives the extracted text.</param>
        /// <param name="xmlNode">Node whose descendants are scanned.</param>
        /// <param name="xmlNamespaceManager">Resolver that maps the <c>w</c> prefix used in the XPath queries.</param>
        static void ReadTextContent(StringBuilder stringBuilder, XmlNode xmlNode, XmlNamespaceManager xmlNamespaceManager)
        {
            XmlNodeList textNodes = xmlNode.SelectNodes(".//w:t | .//w:tab | .//w:br | .//w:footnoteReference | .//w:numPr", xmlNamespaceManager);
            foreach (XmlNode textNode in textNodes)
            {
                // The XPath already restricted the nodes to the "w" namespace URI, so dispatch
                // on the local name. Matching on the prefixed Name would silently drop every
                // node of a document that binds the namespace to a prefix other than "w".
                switch (textNode.LocalName)
                {
                    case "t":
                        stringBuilder.Append(textNode.InnerText);
                        break;

                    case "tab":
                        stringBuilder.Append("\t");
                        break;

                    case "br":
                        stringBuilder.Append("\v");
                        break;

                    case "footnoteReference":
                        // The id attribute lives in the element's own namespace; a missing
                        // attribute contributes nothing instead of throwing.
                        XmlAttribute idAttribute = textNode.Attributes?["id", textNode.NamespaceURI];
                        stringBuilder.Append(idAttribute?.Value);
                        break;

                    case "numPr":
                        XmlNode ilvlNode = textNode.SelectSingleNode(".//w:ilvl", xmlNamespaceManager);
                        XmlNode numIdNode = textNode.SelectSingleNode(".//w:numId", xmlNamespaceManager);
                        if (ilvlNode != null && numIdNode != null)
                        {
                            stringBuilder.Append("*");
                        }
                        break;
                }
            }
        }

        /// <summary>
        /// Extracts the document text with NPOI's HWPF <c>WordExtractor</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): HWPF presumably targets the legacy binary .doc format rather than
        /// .docx — confirm against the NPOI documentation.
        /// </remarks>
        /// <param name="filePath">Path of the document, opened for reading only.</param>
        /// <returns>The text reported by the extractor.</returns>
        static string NpoiDocExtractor(string filePath)
        {
            using (var input = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                var document = new HWPFDocument(input);
                return new WordExtractor(document).Text;
            }
        }

        /// <summary>
        /// Extracts the document text by asking Microsoft Word (via COM interop) to save the
        /// document as UTF-8 text into a temporary file, then reading that file back.
        /// </summary>
        /// <param name="filePath">Path of the document; resolved to an absolute path before use.</param>
        /// <returns>The content of the text file Word produced.</returns>
        public static string WordInteropExtractor(string filePath)
        {
            // Resolve once, before Word is started: invalid paths fail fast, Word (whose
            // current directory may differ from ours) always receives an absolute path, and
            // the comparison in the loop below no longer recomputes it per document.
            string fullPath = Path.GetFullPath(filePath);

            // Use a unique name: deriving it from the input file name could overwrite, and
            // then delete, an unrelated file of the same name in the temp directory, and
            // concurrent extractions of same-named documents would collide.
            string tempFilePath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N") + ".txt");

            try
            {
                using (WordApp wordApp = new WordApp())
                {
                    WordInterop.Document doc = null;
                    bool isFileAlreadyOpen = false;

                    try
                    {
                        // Reuse the document when this Word instance already has it open.
                        foreach (WordInterop.Document openDoc in wordApp.App.Documents)
                        {
                            if (openDoc.FullName.Equals(fullPath, StringComparison.OrdinalIgnoreCase))
                            {
                                doc = openDoc;
                                isFileAlreadyOpen = true;
                                break;
                            }
                        }

                        if (doc == null)
                        {
                            doc = wordApp.App.Documents.Open(fullPath, ReadOnly: true, Visible: false);
                        }

                        var originalFormat = doc.SaveFormat;
                        // 65001 is the UTF-8 code page.
                        doc.SaveAs2(tempFilePath, WordInterop.WdSaveFormat.wdFormatUnicodeText, Encoding: 65001, AddToRecentFiles: false);

                        // NOTE(review): for a document the user already has open, the export
                        // above re-targets it to the temp file, so it is saved back to its
                        // original path and format here. This also writes any unsaved edits
                        // to disk — confirm this side effect is acceptable.
                        if (isFileAlreadyOpen)
                        {
                            doc.SaveAs2(fullPath, originalFormat);
                        }
                    }
                    finally
                    {
                        // Close documents we opened ourselves even when the export failed,
                        // so they do not linger in the Word instance.
                        if (doc != null && !isFileAlreadyOpen)
                        {
                            doc.Close(WordInterop.WdSaveOptions.wdDoNotSaveChanges);
                        }
                    }
                }

                return File.ReadAllText(tempFilePath);
            }
            finally
            {
                if (File.Exists(tempFilePath)) File.Delete(tempFilePath);
            }
        }
    }

    /// <summary>
    /// Wraps a Word <c>Application</c> COM object: attaches to a running instance when one
    /// is registered, otherwise starts a new one that is quit again on disposal.
    /// </summary>
    class WordApp : IDisposable
    {
        /// <summary>The Word application in use; <c>null</c> once the wrapper is disposed.</summary>
        public Microsoft.Office.Interop.Word.Application App;

        // True when this wrapper started Word itself and therefore must shut it down.
        readonly bool isNewApp;

        /// <summary>
        /// Attaches to the active Word instance, or creates a new one when none is available.
        /// </summary>
        public WordApp()
        {
            try
            {
                App = (WordInterop.Application)Marshal.GetActiveObject("Word.Application");
            }
            catch (COMException)
            {
                // No running instance could be obtained: start our own.
                App = new WordInterop.Application();
                isNewApp = true;
            }
        }

        /// <summary>
        /// Quits Word when this wrapper started it, and releases the COM reference.
        /// Safe to call more than once.
        /// </summary>
        public void Dispose()
        {
            WordInterop.Application app = App;
            if (app == null)
            {
                return;
            }

            // Clear the field first so a repeated Dispose does not touch a released wrapper.
            App = null;

            try
            {
                if (isNewApp)
                {
                    // Explicit "do not save", so that a document left open by a failed
                    // extraction is not saved on exit.
                    // NOTE(review): presumably this also avoids a save prompt in the
                    // hidden instance — confirm against the Word interop docs.
                    app.Quit(WordInterop.WdSaveOptions.wdDoNotSaveChanges);
                }
            }
            finally
            {
                // Release our reference even if Quit failed. For an attached instance this
                // only drops our reference; Word itself keeps running.
                Marshal.ReleaseComObject(app);
            }
        }
    }
}
tonyqus commented 1 month ago

Why do you use HWPF for docx? Toxy has Word2007DocumentParser and Word2007TextParser