Open pcinfogmach opened 3 months ago
using DocumentFormat.OpenXml.Packaging;
using NPOI.HWPF;
using NPOI.HWPF.Extractor;
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Xml;
using WordInterop = Microsoft.Office.Interop.Word;

namespace MsWordTextExtractor
{
    /// <summary>
    /// Extracts plain text from Word documents by trying, in order, the Open XML
    /// package reader, NPOI's HWPF reader, and finally Word automation.
    /// </summary>
    public static class DocxTextExtractor
    {
        /// <summary>WordprocessingML main namespace, bound to the "w" prefix in XPath queries.</summary>
        private const string WordNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

        /// <summary>
        /// Extracts the text of the document at <paramref name="filePath"/>.
        /// </summary>
        /// <param name="filePath">Path of the Word document to read.</param>
        /// <returns>
        /// The extracted text, or <see cref="string.Empty"/> when the path is blank or
        /// every extraction strategy fails (the last failure message is written to the console).
        /// </returns>
        public static string Extract(string filePath)
        {
            // Nothing to read; avoids starting Word just to fail on a blank path.
            if (string.IsNullOrWhiteSpace(filePath))
            {
                return string.Empty;
            }

            try
            {
                return ReadAllTextParts(filePath);
            }
            catch (Exception)
            {
                // Best effort: fall through to the next extractor.
            }

            try
            {
                return NpoiDocExtractor(filePath);
            }
            catch (Exception)
            {
                // Best effort: fall through to Word automation.
            }

            try
            {
                return WordInteropExtractor(filePath);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
                return string.Empty;
            }
        }

        /// <summary>
        /// Reads the main document body followed by the footnotes and endnotes parts
        /// (when present) of an Open XML word-processing package.
        /// </summary>
        /// <param name="filePath">Path of the package to open read-only.</param>
        /// <returns>Body text, then each non-empty notes section separated by a blank line.</returns>
        /// <exception cref="InvalidDataException">The package has no main document part.</exception>
        static string ReadAllTextParts(string filePath)
        {
            StringBuilder text = new StringBuilder();

            using (WordprocessingDocument package = WordprocessingDocument.Open(filePath, false))
            {
                MainDocumentPart mainPart = package.MainDocumentPart;
                if (mainPart == null)
                {
                    throw new InvalidDataException("The document has no main document part.");
                }

                using (Stream bodyStream = mainPart.GetStream())
                {
                    text.AppendLine(ReadTextPart(bodyStream));
                }

                AppendNotes(text, mainPart.FootnotesPart);
                // The endnotes section must be read from EndnotesPart, not FootnotesPart.
                AppendNotes(text, mainPart.EndnotesPart);
            }

            return text.ToString();
        }

        /// <summary>
        /// Appends the text of a footnotes/endnotes part, preceded by a blank line,
        /// when the part exists and yields any text.
        /// </summary>
        static void AppendNotes(StringBuilder text, OpenXmlPart notesPart)
        {
            if (notesPart == null)
            {
                return;
            }

            string notes;
            using (Stream notesStream = notesPart.GetStream())
            {
                notes = ReadFootnotesPart(notesStream);
            }

            if (!string.IsNullOrEmpty(notes))
            {
                text.AppendLine();
                text.AppendLine(notes);
            }
        }

        /// <summary>
        /// Loads a part's XML and returns it together with a namespace manager that
        /// maps the "w" prefix to <see cref="WordNamespace"/>.
        /// </summary>
        static XmlDocument LoadWordXml(Stream partStream, out XmlNamespaceManager namespaces)
        {
            NameTable nameTable = new NameTable();
            namespaces = new XmlNamespaceManager(nameTable);
            namespaces.AddNamespace("w", WordNamespace);

            XmlDocument document = new XmlDocument(nameTable);
            // Documents may come from untrusted sources; never resolve external entities/DTDs.
            document.XmlResolver = null;
            document.Load(partStream);
            return document;
        }

        /// <summary>
        /// Returns the text of every <c>w:p</c> paragraph in the part, one paragraph per line,
        /// with leading and trailing whitespace trimmed from the overall result.
        /// </summary>
        static string ReadTextPart(Stream partStream)
        {
            XmlNamespaceManager namespaces;
            XmlDocument document = LoadWordXml(partStream, out namespaces);

            StringBuilder text = new StringBuilder();
            foreach (XmlNode paragraph in document.SelectNodes("//w:p", namespaces))
            {
                ReadTextContent(text, paragraph, namespaces);
                text.Append(Environment.NewLine);
            }

            return text.ToString().Trim();
        }

        /// <summary>
        /// Returns one line per <c>w:footnote</c>/<c>w:endnote</c> element: the note id
        /// immediately followed by the note text. Notes with id "-1" or "0" are skipped.
        /// </summary>
        static string ReadFootnotesPart(Stream partStream)
        {
            XmlNamespaceManager namespaces;
            XmlDocument document = LoadWordXml(partStream, out namespaces);

            StringBuilder text = new StringBuilder();
            foreach (XmlNode note in document.SelectNodes("//w:footnote | .//w:endnote", namespaces))
            {
                // Namespace-qualified lookup works whatever prefix the document uses;
                // a missing id yields null instead of throwing.
                string noteId = note.Attributes["id", WordNamespace]?.Value;

                // NOTE(review): ids -1 and 0 are presumably Word's separator /
                // continuation-separator placeholders rather than real notes - confirm.
                if (noteId == "-1" || noteId == "0")
                {
                    continue;
                }

                text.Append(noteId);
                ReadTextContent(text, note, namespaces);
                text.AppendLine();
            }

            return text.ToString().Trim();
        }

        /// <summary>
        /// Appends the textual content found beneath <paramref name="xmlNode"/>:
        /// text runs verbatim, tab characters as "\t", breaks as "\v", footnote/endnote
        /// references as their id, and "*" for a numbering properties element that
        /// has both a level and a numbering id.
        /// </summary>
        static void ReadTextContent(StringBuilder stringBuilder, XmlNode xmlNode, XmlNamespaceManager xmlNamespaceManager)
        {
            XmlNodeList contentNodes = xmlNode.SelectNodes(
                ".//w:t | .//w:tab | .//w:br | .//w:footnoteReference | .//w:endnoteReference | .//w:numPr",
                xmlNamespaceManager);

            foreach (XmlNode node in contentNodes)
            {
                // The XPath already constrained the namespace, so LocalName is enough
                // and does not depend on the document using the "w" prefix.
                switch (node.LocalName)
                {
                    case "t":
                        stringBuilder.Append(node.InnerText);
                        break;

                    case "tab":
                        // w:tab inside w:tabs is a tab-stop definition in the paragraph
                        // properties, not a tab character in the text.
                        if (node.ParentNode?.LocalName != "tabs")
                        {
                            stringBuilder.Append("\t");
                        }
                        break;

                    case "br":
                        stringBuilder.Append("\v");
                        break;

                    case "footnoteReference":
                    case "endnoteReference":
                        stringBuilder.Append(node.Attributes["id", WordNamespace]?.Value);
                        break;

                    case "numPr":
                        XmlNode levelNode = node.SelectSingleNode(".//w:ilvl", xmlNamespaceManager);
                        XmlNode numberingIdNode = node.SelectSingleNode(".//w:numId", xmlNamespaceManager);
                        if (levelNode != null && numberingIdNode != null)
                        {
                            stringBuilder.Append("*");
                        }
                        break;
                }
            }
        }

        /// <summary>
        /// Extracts text with NPOI's HWPF reader (<see cref="HWPFDocument"/> + <see cref="WordExtractor"/>).
        /// </summary>
        static string NpoiDocExtractor(string filePath)
        {
            using (FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                HWPFDocument doc = new HWPFDocument(fileStream);
                WordExtractor extractor = new WordExtractor(doc);
                return extractor.Text;
            }
        }

        /// <summary>
        /// Extracts text by asking Word to save the document as UTF-8 text into a
        /// temporary file, then reading that file back. The temporary file is always deleted.
        /// </summary>
        /// <param name="filePath">Path of the document; resolved to a full path before use.</param>
        /// <returns>The contents of the text file Word produced.</returns>
        public static string WordInteropExtractor(string filePath)
        {
            // Resolve once in this process so a relative path is not left for Word to interpret.
            string fullPath = Path.GetFullPath(filePath);

            // Unique name: a name derived from the source file could overwrite (and the
            // cleanup below then delete) an unrelated file in the temp directory.
            string tempFilePath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N") + ".txt");

            try
            {
                using (WordApp wordApp = new WordApp())
                {
                    WordInterop.Document doc = null;
                    bool isFileAlreadyOpen = false;

                    foreach (WordInterop.Document openDoc in wordApp.App.Documents)
                    {
                        if (openDoc.FullName.Equals(fullPath, StringComparison.OrdinalIgnoreCase))
                        {
                            doc = openDoc;
                            isFileAlreadyOpen = true;
                            break;
                        }
                    }

                    if (doc == null)
                    {
                        doc = wordApp.App.Documents.Open(fullPath, ReadOnly: true, Visible: false);
                    }

                    try
                    {
                        var originalFormat = doc.SaveFormat;
                        doc.SaveAs2(tempFilePath, WordInterop.WdSaveFormat.wdFormatUnicodeText, Encoding: 65001, AddToRecentFiles: false);

                        // NOTE(review): for a document the user already has open, SaveAs2 re-targets
                        // it to the temp file, so it is saved back to its original path here. That
                        // also persists any unsaved edits the user had - confirm this is acceptable.
                        if (isFileAlreadyOpen)
                        {
                            doc.SaveAs2(fullPath, originalFormat);
                        }
                    }
                    finally
                    {
                        // Close only what we opened, even when saving failed.
                        if (!isFileAlreadyOpen)
                        {
                            doc.Close(WordInterop.WdSaveOptions.wdDoNotSaveChanges);
                        }
                    }
                }

                return File.ReadAllText(tempFilePath);
            }
            finally
            {
                if (File.Exists(tempFilePath))
                {
                    File.Delete(tempFilePath);
                }
            }
        }
    }

    /// <summary>
    /// Wraps a Word <c>Application</c> object: attaches to a running instance when one
    /// is registered, otherwise starts a new one and quits it on dispose.
    /// </summary>
    class WordApp : IDisposable
    {
        /// <summary>The attached or newly created Word application.</summary>
        public Microsoft.Office.Interop.Word.Application App;

        // True only when this wrapper started Word itself and therefore owns its lifetime.
        bool isNewApp;

        /// <summary>Attaches to the active Word instance, or creates one if none is available.</summary>
        public WordApp()
        {
            try
            {
                App = (WordInterop.Application)Marshal.GetActiveObject("Word.Application");
            }
            catch (COMException)
            {
                App = new WordInterop.Application();
                isNewApp = true;
            }
        }

        /// <summary>
        /// Quits and releases Word when this wrapper created it; an instance that was
        /// already running is left untouched.
        /// </summary>
        public void Dispose()
        {
            if (!isNewApp || App == null)
            {
                return;
            }

            try
            {
                App.Quit();
            }
            finally
            {
                // Release the COM reference even if Quit failed, and make Dispose idempotent.
                Marshal.ReleaseComObject(App);
                App = null;
            }
        }
    }
}
Why do you use HWPF for .docx files? Toxy has Word2007DocumentParser and Word2007TextParser.