plutext / docx4j

JAXB-based Java library for Word docx, Powerpoint pptx, and Excel xlsx files
https://www.docx4java.org/
2.11k stars 1.2k forks source link

chinese issue #259

Open TimothyWang123 opened 7 years ago

TimothyWang123 commented 7 years ago

I used docx4j to convert a .docx file to a .pdf file. Everything is OK except that the Chinese characters come out garbled.... image

plutext commented 7 years ago

Missing font. Did you use export-fo, or Plutext's commercial converter?

TimothyWang123 commented 7 years ago

hi, I followed the example on GitHub. Here is my code.

import java.io.File; import java.io.OutputStream;

import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.docx4j.Docx4J; import org.docx4j.convert.out.FOSettings; import org.docx4j.fonts.IdentityPlusMapper; import org.docx4j.fonts.Mapper; import org.docx4j.fonts.PhysicalFont; import org.docx4j.fonts.PhysicalFonts; import org.docx4j.model.fields.FieldUpdater; import org.docx4j.openpackaging.exceptions.Docx4JException; import org.docx4j.openpackaging.packages.WordprocessingMLPackage; import org.docx4j.samples.AbstractSample; import org.docx4j.services.client.ConversionException;

public class ConvertOutPDF extends AbstractSample { // Config for non-command line use static {

    inputfilepath = null; // to generate a docx (and PDF output) containing font samples

    inputfilepath = "C:/Users/wangzh20785/Desktop/swarm.docx";

    // URL of converter instance

// Docx4jProperties.setProperty("com.plutext.converter.URL", "http://localhost:9016/v1/00000000-0000-0000-0000-000000000000/convert");

    // XSL-FO only
    saveFO = true;
}

// For demo/debugging purposes, save the intermediate XSL FO
// Don't do this in production!
static boolean saveFO;

public static void main(String[] args) throws Exception {
    try {
        getInputFilePath(args);
    } catch (IllegalArgumentException e) {

    }

    // Font regex (optional)
    // Set regex if you want to restrict to some defined subset of fonts
    // Here we have to do this before calling createContent,
    // since that discovers fonts
    String regex = null;
    // Windows:
    // String
    // regex=".*(calibri|camb|cour|arial|symb|times|Times|zapf).*";

// regex=".(calibri|camb|cour|arial|times|comic|georgia|impact|LSANS|pala|tahoma|trebuc|verdana|symbol|webdings|wingding)."; // Mac // String

    // Document loading (required)
    WordprocessingMLPackage wordMLPackage;
    if (inputfilepath==null) {
        // Create a docx
        System.out.println("No imput path passed, creating dummy document");
         wordMLPackage = WordprocessingMLPackage.createPackage();

// SampleDocument.createContent(wordMLPackage.getMainDocumentPart()); } else { // Load .docx or Flat OPC .xml System.out.println("Loading file from " + inputfilepath); wordMLPackage = WordprocessingMLPackage.load(new java.io.File(inputfilepath)); }

    // Refresh the values of DOCPROPERTY fields 
    FieldUpdater updater = new FieldUpdater(wordMLPackage);
    updater.update(true);

    String outputfilepath;
    if (inputfilepath==null) {
        outputfilepath = "C:/Users/wangzh20785/Desktop/swarm.pdf";          
    } else {
        outputfilepath = "C:/Users/wangzh20785/Desktop/swarm.pdf";
    }

    // All methods write to an output stream
    OutputStream os = new java.io.FileOutputStream(outputfilepath);

    if (!Docx4J.pdfViaFO()) {
        // Since 3.3.0, Plutext's PDF Converter is used by default

        System.out.println("Using Plutext's PDF Converter; add docx4j-export-fo if you don't want that");
        Mapper fontMapper = new IdentityPlusMapper(); 

// Mapper fontMapper = new IdentityPlusMapper(); // String fontFamily = "SimSun"; // PhysicalFont simsunFont = PhysicalFonts.get(fontFamily); // fontMapper.put(fontFamily, simsunFont); // regex=".(SimSun|calibri|camb|cour|arial|times|comic|georgia|impact|LSANS|pala|tahoma|trebuc|verdana|symbol|webdings|wingding)."; // PhysicalFonts.setRegex(regex);

// PhysicalFont font = PhysicalFonts.getPhysicalFonts().get("Arial Unicode MS"); // if (font!=null) { // fontMapper.getFontMappings().put("Times New Roman", font); // } fontMapper.put("隶书", PhysicalFonts.get("LiSu")); fontMapper.put("宋体",PhysicalFonts.get("SimSun")); fontMapper.put("微软雅黑",PhysicalFonts.get("Microsoft Yahei")); fontMapper.put("黑体",PhysicalFonts.get("SimHei")); fontMapper.put("楷体",PhysicalFonts.get("KaiTi")); fontMapper.put("新宋体",PhysicalFonts.get("NSimSun")); fontMapper.put("华文行楷", PhysicalFonts.get("STXingkai")); fontMapper.put("华文仿宋", PhysicalFonts.get("STFangsong")); fontMapper.put("宋体扩展",PhysicalFonts.get("simsun-extB")); fontMapper.put("仿宋",PhysicalFonts.get("FangSong")); fontMapper.put("仿宋_GB2312",PhysicalFonts.get("FangSong_GB2312")); fontMapper.put("幼圆",PhysicalFonts.get("YouYuan")); fontMapper.put("华文宋体",PhysicalFonts.get("STSong")); fontMapper.put("华文中宋",PhysicalFonts.get("STZhongsong")); wordMLPackage.setFontMapper(fontMapper);

        try {
            System.out.println("我要开始转换啦!");

// FOSettings foSettings = Docx4J.createFOSettings();
// foSettings.setWmlPackage(wordMLPackage);
// Docx4J.toFO(foSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);
Docx4J.toPDF(wordMLPackage, os); } catch (Docx4JException e) { System.out.println("我有错!"); e.printStackTrace(); IOUtils.closeQuietly(os); System.out.println(FileUtils.readFileToString(new File(outputfilepath))); if (e.getCause()!=null&& e.getCause() instanceof ConversionException) { ConversionException ce = (ConversionException)e.getCause(); ce.printStackTrace(); } } System.out.println("Saved: " + outputfilepath);

        return;
    }

    System.out.println("Attempting to use XSL FO");

    regex=".*(Courier New|Arial|Times New Roman|Comic Sans|Georgia|Impact|Lucida Console|Lucida Sans Unicode|Palatino Linotype|Tahoma|Trebuchet|Verdana|Symbol|Webdings|Wingdings|MS Sans Serif|MS Serif|SimSun).*";
    PhysicalFonts.setRegex(regex);

    // Set up font mapper (optional)
    Mapper fontMapper = new IdentityPlusMapper();
    wordMLPackage.setFontMapper(fontMapper);

    PhysicalFont font = PhysicalFonts.get("Arial Unicode MS"); 
        // make sure this is in your regex (if any)!!!

// if (font!=null) { // fontMapper.put("Times New Roman", font); // fontMapper.put("Arial", font); fontMapper.put("隶书", PhysicalFonts.get("LiSu")); fontMapper.put("宋体",PhysicalFonts.get("SimSun")); fontMapper.put("微软雅黑",PhysicalFonts.get("Microsoft Yahei")); fontMapper.put("黑体",PhysicalFonts.get("SimHei")); fontMapper.put("楷体",PhysicalFonts.get("KaiTi")); fontMapper.put("新宋体",PhysicalFonts.get("NSimSun")); fontMapper.put("华文行楷", PhysicalFonts.get("STXingkai")); fontMapper.put("华文仿宋", PhysicalFonts.get("STFangsong")); fontMapper.put("宋体扩展",PhysicalFonts.get("simsun-extB")); fontMapper.put("仿宋",PhysicalFonts.get("FangSong")); fontMapper.put("仿宋_GB2312",PhysicalFonts.get("FangSong_GB2312")); fontMapper.put("幼圆",PhysicalFonts.get("YouYuan")); fontMapper.put("华文宋体",PhysicalFonts.get("STSong")); fontMapper.put("华文中宋",PhysicalFonts.get("STZhongsong")); // } // fontMapper.put("Libian SC Regular", PhysicalFonts.get("SimSun"));

    FOSettings foSettings = Docx4J.createFOSettings();
    if (saveFO) {
        foSettings.setFoDumpFile(new java.io.File(inputfilepath + ".fo"));
    }
    foSettings.setWmlPackage(wordMLPackage);

    // Document format: 
    // The default implementation of the FORenderer that uses Apache Fop will output
    // a PDF document if nothing is passed via 
    // foSettings.setApacheFopMime(apacheFopMime)
    // apacheFopMime can be any of the output formats defined in org.apache.fop.apps.MimeConstants eg org.apache.fop.apps.MimeConstants.MIME_FOP_IF or
    // FOSettings.INTERNAL_FO_MIME if you want the fo document as the result.
    //foSettings.setApacheFopMime(FOSettings.INTERNAL_FO_MIME);

    // Specify whether PDF export uses XSLT or not to create the FO
    // (XSLT takes longer, but is more complete).

    // Don't care what type of exporter you use
    Docx4J.toFO(foSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);

    // Prefer the exporter, that uses a xsl transformation
    // Docx4J.toFO(foSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);

    // Prefer the exporter, that doesn't use a xsl transformation (= uses a visitor)
    // .. faster, but not yet at feature parity
    // Docx4J.toFO(foSettings, os, Docx4J.FLAG_EXPORT_PREFER_NONXSL);

    System.out.println("Saved: " + outputfilepath+"这是第二个saved");

    // Clean up, so any ObfuscatedFontPart temp files can be deleted 
    if (wordMLPackage.getMainDocumentPart().getFontTablePart()!=null) {
        wordMLPackage.getMainDocumentPart().getFontTablePart().deleteEmbeddedFontTempFiles();
    }       
    // This would also do it, via finalize() methods
    updater = null;
    foSettings = null;
    wordMLPackage = null;

}

}

plutext commented 7 years ago

Is docx4j-export-FO on your class path? If not, you are using Plutext's commercial converter. If you are using Plutext's commercial converter, download/install your own instance (from converter-eval.plutext.com), and if necessary, add the relevant Chinese fonts to its install dir.

TimothyWang123 commented 7 years ago

@plutext hi, thanks. When I tried to download from converter-eval.plutext.com, it asked for a work email address, but I am still a student in China...... Is there any way to convert Word to PDF purely in code on Windows and Linux systems? And for free!

TimothyWang123 commented 7 years ago

@plutext hi, I set the font but it still didn't work... sad... I also tried the demo website linked on GitHub, and it didn't work either...

image

plutext commented 7 years ago

We don't have Chinese fonts installed on our instance at converter-eval.plutext.com; happy to send you a link if you tell me which download you'd like.

the fontMapper stuff is only for docx4j-export-FO; have you got that on your classpath now?

plutext commented 7 years ago

I've just done some testing. Should work if you use export-FO jar; you can try https://www.docx4java.org/docx4j/docx4j-export-fo-nightly-20170721.jar

TylerCheung008 commented 6 years ago

Win和Mac用TimothyWang123 的方法可以解决docx转pdf乱码问题。关于中文的docx转pdf乱码问题,Linux系统需要安装字体库。