使用POI实现word转html

超、凢脫俗 2023-01-22 10:51 51阅读 0赞

一、maven依赖

  1. <!-- 针对2007以上版本的库 -->
  2. <dependency>
  3. <groupId>org.apache.poi</groupId>
  4. <artifactId>poi-ooxml</artifactId>
  5. <version>4.1.2</version>
  6. </dependency>
  7. <!-- 针对2003版本的库 -->
  8. <dependency>
  9. <groupId>org.apache.poi</groupId>
  10. <artifactId>poi-scratchpad</artifactId>
  11. <version>4.1.2</version>
  12. </dependency>
  13. <dependency>
  14. <groupId>fr.opensagres.xdocreport</groupId>
  15. <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
  16. <version>2.0.2</version>
  17. </dependency>

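经我测试,只需要这些依赖即可完成 Word 到 HTML 的转换。注意:下文的转换代码还使用了 hutool 的 ImgUtil(cn.hutool.core.img.ImgUtil)来处理图片,因此还需要额外引入 hutool 依赖(例如 cn.hutool:hutool-all)。网上大部分依赖都已较为过时,而且来源都差不多。其它文章中提到的库“org.apache.poi.xwpf.converter.xhtml”的 groupId 和 artifactId 已经修改为以“fr.opensagres”开头,如果有其它需要的依赖,按需加入即可。poi 新版本的 API 有变化,所以对应的其他包也需要引入最新版本。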

二、转换代码

  1. import cn.hutool.core.img.ImgUtil;
  2. import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
  3. import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
  4. import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
  5. import org.apache.poi.hwpf.HWPFDocument;
  6. import org.apache.poi.hwpf.converter.WordToHtmlConverter;
  7. import org.apache.poi.openxml4j.util.ZipSecureFile;
  8. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  9. import org.w3c.dom.Document;
  10. import javax.xml.parsers.DocumentBuilderFactory;
  11. import javax.xml.parsers.ParserConfigurationException;
  12. import javax.xml.transform.OutputKeys;
  13. import javax.xml.transform.Transformer;
  14. import javax.xml.transform.TransformerException;
  15. import javax.xml.transform.TransformerFactory;
  16. import javax.xml.transform.dom.DOMSource;
  17. import javax.xml.transform.stream.StreamResult;
  18. import java.awt.image.BufferedImage;
  19. import java.io.*;
  20. /**
  21. * office转换工具测试
  22. *
  23. */
  24. public class OfficeConvertUtil {
  25. /**
  26. * 将word2003转换为html文件 2017-2-27
  27. *
  28. * @param wordPath word文件路径
  29. * @param wordName word文件名称无后缀
  30. * @param suffix word文件后缀
  31. * @throws IOException
  32. * @throws TransformerException
  33. * @throws ParserConfigurationException
  34. */
  35. public static String Word2003ToHtml(String wordPath, String wordName,
  36. String suffix) throws IOException, TransformerException,
  37. ParserConfigurationException {
  38. String htmlPath = wordPath + File.separator + "html"
  39. + File.separator;
  40. String htmlName = wordName + ".html";
  41. final String imagePath = htmlPath + "image" + File.separator;
  42. // 判断html文件是否存在,每次重新生成
  43. File htmlFile = new File(htmlPath + htmlName);
  44. // if (htmlFile.exists()) {
  45. // return htmlFile.getAbsolutePath();
  46. // }
  47. // 原word文档
  48. final String file = wordPath + File.separator + wordName + suffix;
  49. InputStream input = new FileInputStream(new File(file));
  50. HWPFDocument wordDocument = new HWPFDocument(input);
  51. WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
  52. DocumentBuilderFactory.newInstance().newDocumentBuilder()
  53. .newDocument());
  54. wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> {
  55. BufferedImage bufferedImage = ImgUtil.toImage(content);
  56. String base64Img = ImgUtil.toBase64(bufferedImage, pictureType.getExtension());
  57. // 带图片的word,则将图片转为base64编码,保存在一个页面中
  58. StringBuilder sb = (new StringBuilder(base64Img.length() + "data:;base64,".length()).append("data:;base64,").append(base64Img));
  59. return sb.toString();
  60. });
  61. // 解析word文档
  62. wordToHtmlConverter.processDocument(wordDocument);
  63. Document htmlDocument = wordToHtmlConverter.getDocument();
  64. // 生成html文件上级文件夹
  65. File folder = new File(htmlPath);
  66. if (!folder.exists()) {
  67. folder.mkdirs();
  68. }
  69. // 生成html文件地址
  70. OutputStream outStream = new FileOutputStream(htmlFile);
  71. DOMSource domSource = new DOMSource(htmlDocument);
  72. StreamResult streamResult = new StreamResult(outStream);
  73. TransformerFactory factory = TransformerFactory.newInstance();
  74. Transformer serializer = factory.newTransformer();
  75. serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
  76. serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  77. serializer.setOutputProperty(OutputKeys.METHOD, "html");
  78. serializer.transform(domSource, streamResult);
  79. outStream.close();
  80. return htmlFile.getAbsolutePath();
  81. }
  82. /**
  83. * 2007版本word转换成html 2017-2-27
  84. *
  85. * @param wordPath word文件路径
  86. * @param wordName word文件名称无后缀
  87. * @param suffix word文件后缀
  88. * @return
  89. * @throws IOException
  90. */
  91. public static String Word2007ToHtml(String wordPath, String wordName, String suffix)
  92. throws IOException {
  93. ZipSecureFile.setMinInflateRatio(-1.0d);
  94. String htmlPath = wordPath + File.separator + "html"
  95. + File.separator;
  96. String htmlName = wordName + ".html";
  97. String imagePath = htmlPath + "image" + File.separator;
  98. // 判断html文件是否存在
  99. File htmlFile = new File(htmlPath + htmlName);
  100. // if (htmlFile.exists()) {
  101. // return htmlFile.getAbsolutePath();
  102. // }
  103. // word文件
  104. File wordFile = new File(wordPath + File.separator + wordName + suffix);
  105. // 1) 加载word文档生成 XWPFDocument对象
  106. InputStream in = new FileInputStream(wordFile);
  107. XWPFDocument document = new XWPFDocument(in);
  108. // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
  109. File imgFolder = new File(imagePath);
  110. // 带图片的word,则将图片转为base64编码,保存在一个页面中
  111. XHTMLOptions options = XHTMLOptions.create().indent(4).setImageManager(new Base64EmbedImgManager());
  112. // 3) 将 XWPFDocument转换成XHTML
  113. // 生成html文件上级文件夹
  114. File folder = new File(htmlPath);
  115. if (!folder.exists()) {
  116. folder.mkdirs();
  117. }
  118. OutputStream out = new FileOutputStream(htmlFile);
  119. XHTMLConverter.getInstance().convert(document, out, options);
  120. return htmlFile.getAbsolutePath();
  121. }
  122. public static void main(String[] args) throws Exception {
  123. System.out.println(Word2003ToHtml("D:\\temp\\word", "21", ".doc"));
  124. System.out.println(Word2007ToHtml("D:\\temp\\word", "3", ".docx"));
  125. }
  126. }

以上编码参考了https://www.cnblogs.com/zhaosq/p/12069087.html实现。内容基本一致,只是将图片的保存方式修改为base64编码,保存到网页文件中。

该文章转载自:https://zhuanlan.zhihu.com/p/139287354

发表评论

表情:
评论列表 (有 0 条评论,51人围观)

还没有评论,来说两句吧...

相关阅读