Java实现word转HTML

清疚 2023-06-18 13:57 42阅读 0赞

用Java将Word转换为HTML,主要是为了实现Word文档的在线浏览。不过转换效果并不理想(详见文末的使用结果)。下面开始开发。

第一步:引入Maven依赖

  1. <!-- Word转HTML start -->
  2. <dependency>
  3. <groupId>org.apache.poi</groupId>
  4. <artifactId>poi-scratchpad</artifactId>
  5. <version>3.14</version>
  6. </dependency>
  7. <dependency>
  8. <groupId>org.apache.poi</groupId>
  9. <artifactId>poi-ooxml</artifactId>
  10. <version>3.14</version>
  11. </dependency>
  12. <dependency>
  13. <groupId>fr.opensagres.xdocreport</groupId>
  14. <artifactId>xdocreport</artifactId>
  15. <version>1.0.6</version>
  16. </dependency>
  17. <dependency>
  18. <groupId>org.apache.poi</groupId>
  19. <artifactId>poi-ooxml-schemas</artifactId>
  20. <version>3.14</version>
  21. </dependency>
  22. <dependency>
  23. <groupId>org.apache.poi</groupId>
  24. <artifactId>ooxml-schemas</artifactId>
  25. <version>1.3</version>
  26. </dependency>
  27. <dependency>
  28. <groupId>org.jsoup</groupId>
  29. <artifactId>jsoup</artifactId>
  30. <version>1.11.3</version>
  31. </dependency>
  32. <!-- Word转HTML end -->

第二步:java实现代码

原始版本:

  1. package com.mmxpw.mmw.file.view.word;
  2. import org.apache.poi.hwpf.HWPFDocument;
  3. import org.apache.poi.hwpf.converter.PicturesManager;
  4. import org.apache.poi.hwpf.converter.WordToHtmlConverter;
  5. import org.apache.poi.hwpf.usermodel.PictureType;
  6. import org.apache.poi.xwpf.converter.core.BasicURIResolver;
  7. import org.apache.poi.xwpf.converter.core.FileImageExtractor;
  8. import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
  9. import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
  10. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  11. import org.w3c.dom.Document;
  12. import javax.xml.parsers.DocumentBuilderFactory;
  13. import javax.xml.parsers.ParserConfigurationException;
  14. import javax.xml.transform.OutputKeys;
  15. import javax.xml.transform.Transformer;
  16. import javax.xml.transform.TransformerException;
  17. import javax.xml.transform.TransformerFactory;
  18. import javax.xml.transform.dom.DOMSource;
  19. import javax.xml.transform.stream.StreamResult;
  20. import java.io.*;
  21. public class WordToHtml {
  22. /**
  23. * 将word2003转换为html文件
  24. *
  25. * @param wordPath word文件路径
  26. * @param wordName word文件名称无后缀
  27. * @param suffix word文件后缀
  28. * @param htmlPath html存储地址
  29. * @throws IOException
  30. * @throws TransformerException
  31. * @throws ParserConfigurationException
  32. */
  33. public static String Word2003ToHtml(String wordPath, String wordName, String suffix, String htmlPath)
  34. throws IOException, TransformerException, ParserConfigurationException {
  35. String htmlName = wordName + ".html";
  36. final String imagePath = htmlPath + "image" + File.separator;
  37. // 判断html文件是否存在
  38. File htmlFile = new File(htmlPath + htmlName);
  39. if (htmlFile.exists()) {
  40. return htmlFile.getAbsolutePath();
  41. }
  42. // 原word文档
  43. final String file = wordPath + File.separator + wordName + suffix;
  44. InputStream input = new FileInputStream(new File(file));
  45. HWPFDocument wordDocument = new HWPFDocument(input);
  46. WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
  47. DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
  48. // 设置图片存放的位置
  49. wordToHtmlConverter.setPicturesManager(new PicturesManager() {
  50. public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
  51. float heightInches) {
  52. File imgPath = new File(imagePath);
  53. if (!imgPath.exists()) {// 图片目录不存在则创建
  54. imgPath.mkdirs();
  55. }
  56. File file = new File(imagePath + suggestedName);
  57. try {
  58. OutputStream os = new FileOutputStream(file);
  59. os.write(content);
  60. os.close();
  61. } catch (FileNotFoundException e) {
  62. e.printStackTrace();
  63. } catch (IOException e) {
  64. e.printStackTrace();
  65. }
  66. // 图片在html文件上的路径 相对路径
  67. return "image/" + suggestedName;
  68. }
  69. });
  70. // 解析word文档
  71. wordToHtmlConverter.processDocument(wordDocument);
  72. Document htmlDocument = wordToHtmlConverter.getDocument();
  73. // 生成html文件上级文件夹
  74. File folder = new File(htmlPath);
  75. if (!folder.exists()) {
  76. folder.mkdirs();
  77. }
  78. OutputStream outStream = new FileOutputStream(htmlFile);
  79. DOMSource domSource = new DOMSource(htmlDocument);
  80. StreamResult streamResult = new StreamResult(outStream);
  81. TransformerFactory factory = TransformerFactory.newInstance();
  82. Transformer serializer = factory.newTransformer();
  83. serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
  84. serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  85. serializer.setOutputProperty(OutputKeys.METHOD, "html");
  86. serializer.transform(domSource, streamResult);
  87. return htmlFile.getAbsolutePath();
  88. }
  89. /**
  90. *
  91. * 2007版本word转换成html
  92. *
  93. * @param wordPath word文件路径
  94. * @param wordName word文件名称无后缀
  95. * @param suffix word文件后缀
  96. * @param htmlPath html存储地址
  97. * @return
  98. * @throws IOException
  99. */
  100. public static String Word2007ToHtml(String wordPath, String wordName, String suffix, String htmlPath)
  101. throws IOException {
  102. String htmlName = wordName + ".html";
  103. String imagePath = htmlPath + "image" + File.separator;
  104. // 判断html文件是否存在
  105. File htmlFile = new File(htmlPath + htmlName);
  106. if (htmlFile.exists()) {
  107. return htmlFile.getAbsolutePath();
  108. }
  109. // word文件
  110. File wordFile = new File(wordPath + File.separator + wordName + suffix);
  111. // 1) 加载word文档生成 XWPFDocument对象
  112. InputStream in = new FileInputStream(wordFile);
  113. XWPFDocument document = new XWPFDocument(in);
  114. // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
  115. File imgFolder = new File(imagePath);
  116. XHTMLOptions options = XHTMLOptions.create();
  117. options.setExtractor(new FileImageExtractor(imgFolder));
  118. // html中图片的路径 相对路径
  119. options.URIResolver(new BasicURIResolver("image"));
  120. options.setIgnoreStylesIfUnused(false);
  121. options.setFragment(true);
  122. // 3) 将 XWPFDocument转换成XHTML
  123. // 生成html文件上级文件夹
  124. File folder = new File(htmlPath);
  125. if (!folder.exists()) {
  126. folder.mkdirs();
  127. }
  128. OutputStream out = new FileOutputStream(htmlFile);
  129. XHTMLConverter.getInstance().convert(document, out, options);
  130. return htmlFile.getAbsolutePath();
  131. }
  132. public static void main(String[] args) {
  133. try {
  134. //Word2003ToHtml("E:\\templates", "2019-11-26", ".doc", "E://templates/");
  135. Word2007ToHtml("E:\\templates", "OnLineWord", ".doc", "E://templates/");
  136. } catch (Exception e) {
  137. e.printStackTrace();
  138. }
  139. }
  140. }

本人业务定制版本:

  1. package com.mmxpw.mmw.file.view.word;
  2. import org.apache.commons.fileupload.FileItem;
  3. import org.apache.poi.hwpf.HWPFDocument;
  4. import org.apache.poi.hwpf.converter.PicturesManager;
  5. import org.apache.poi.hwpf.converter.WordToHtmlConverter;
  6. import org.apache.poi.hwpf.usermodel.PictureType;
  7. import org.apache.poi.xwpf.converter.core.BasicURIResolver;
  8. import org.apache.poi.xwpf.converter.core.FileImageExtractor;
  9. import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
  10. import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
  11. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  12. import org.w3c.dom.Document;
  13. import javax.xml.parsers.DocumentBuilderFactory;
  14. import javax.xml.parsers.ParserConfigurationException;
  15. import javax.xml.transform.OutputKeys;
  16. import javax.xml.transform.Transformer;
  17. import javax.xml.transform.TransformerException;
  18. import javax.xml.transform.TransformerFactory;
  19. import javax.xml.transform.dom.DOMSource;
  20. import javax.xml.transform.stream.StreamResult;
  21. import java.io.*;
  22. import java.util.ArrayList;
  23. import java.util.List;
  24. /**
  25. * @Class WordToHtmlNew
  26. * @Version 1.0
  27. * @Date 创建时间:2019-12-02 09:29
  28. * @Direction 类说明 传入文件,自动识别2003版本或者07版本,然后解析成HTML,且自动解析里面的图片,然后对图片做本地化存储,然后返回HTML
  29. */
  30. public class WordToHtmlNew {
  31. public final static List<String> FILE_TYPE = new ArrayList<>() ;
  32. static{
  33. FILE_TYPE.add( "doc" ) ;
  34. FILE_TYPE.add( "docx" ) ;
  35. }
  36. /***
  37. * 文件格式处理
  38. * @param fileItem
  39. * @return
  40. * @throws Exception
  41. */
  42. public static boolean fileTypeValidate( File fileItem ) throws Exception{
  43. String[] namePix = fileItem.getName().split("\\u002E");
  44. String suffix = namePix[namePix.length-1] ;
  45. if( !FILE_TYPE.contains( suffix ) ){
  46. throw new Exception( "您导入的文件格式错误,请导入word类型的文件." ) ;
  47. }else{
  48. return true ;
  49. }
  50. }
  51. /****
  52. * 判断文档的类型 本方法判断的是是否是新版
  53. * @param fileItem
  54. * @return
  55. */
  56. public static boolean isDocx( File fileItem ){
  57. String[] namePix = fileItem.getName().split("\\u002E");
  58. String suffix = namePix[namePix.length-1] ;
  59. if( suffix.toLowerCase().equals( "docx" )){
  60. return true ;
  61. }else{
  62. return false ;
  63. }
  64. }
  65. /****
  66. * 自动识别版本
  67. * 自动生成word内部的图片
  68. * 自动过滤其他的类型文件
  69. * 自动添加不通模块的图片前缀
  70. * @param file
  71. * @param iamgePrefix 在file的同级创建文件夹-image 提取word内部图片前缀为传入值:prefix
  72. * @param accessPath html访问路径的url为:accessPath
  73. * @return
  74. */
  75. public static String WordAutoToHtml( File file , String iamgePrefix , String accessPath ) throws Exception {
  76. if( file == null ){
  77. throw new Exception( "您传入的文件为空." ) ;
  78. }else{
  79. if( !file.exists() ){
  80. throw new Exception( "您传入的文件不存在." ) ;
  81. }else{
  82. if ( fileTypeValidate( file ) ){
  83. //1.1 开始提取文件名称,文件路径
  84. String path = file.getParent() ;
  85. String fileName = file.getName() ;
  86. //切割文件名里面的信息
  87. String prefix = fileName.substring( 0 , fileName.lastIndexOf(".") );
  88. String suffix = fileName.substring( fileName.lastIndexOf(".") + 1 );
  89. /*String[] namePix = fileName.split("\\u002E");
  90. String suffix = namePix[namePix.length-1] ;*/
  91. //1.2 开始做文件识别,然后指向去2007版本的docx 还是去2003版本的doc
  92. path = path + File.separator ;
  93. if( isDocx( file ) ){
  94. return Word2007ToHtml( file , path , prefix , path , iamgePrefix , accessPath ) ;
  95. }else{
  96. return Word2003ToHtml( file , path , prefix , path , iamgePrefix , accessPath ) ;
  97. }
  98. }
  99. }
  100. }
  101. return null ;
  102. }
  103. /**
  104. * 将word2003转换为html文件
  105. *
  106. * @param wordFile word文件
  107. * @param wordPath word文件路径
  108. * @param wordName word文件名称无后缀
  109. * @param htmlPath html存储地址
  110. * @param prefix 图片存储前缀
  111. * @param accessPath 图片的相对路径访问地址
  112. * @throws IOException
  113. * @throws TransformerException
  114. * @throws ParserConfigurationException
  115. */
  116. public static String Word2003ToHtml(File wordFile , String wordPath, String wordName, String htmlPath
  117. ,final String prefix ,final String accessPath)
  118. throws IOException, TransformerException, ParserConfigurationException {
  119. String htmlName = wordName + ".html";
  120. //1.1 判断html文件是否存在
  121. File htmlFile = new File(htmlPath + htmlName);
  122. if (htmlFile.exists()) {
  123. return htmlFile.getAbsolutePath();
  124. }
  125. //1.2 原word文档 - 文件路径信息
  126. //final String file = wordPath + File.separator + wordName + "." + suffix;
  127. //InputStream input = new FileInputStream( new File( wordFile ) );
  128. InputStream input = new FileInputStream( wordFile );
  129. //1.3 final String imagePath
  130. final String imagePath = wordPath + "image" ;
  131. HWPFDocument wordDocument = new HWPFDocument(input);
  132. WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
  133. DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
  134. // 设置图片存放的位置
  135. wordToHtmlConverter.setPicturesManager(new PicturesManager() {
  136. public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
  137. float heightInches) {
  138. File imgPath = new File(imagePath);
  139. if (!imgPath.exists()) {// 图片目录不存在则创建
  140. imgPath.mkdirs();
  141. }
  142. String imageFileName = prefix + suggestedName ;
  143. File file = new File(imagePath + File.separator + imageFileName );
  144. try {
  145. OutputStream os = new FileOutputStream(file);
  146. os.write(content);
  147. os.close();
  148. } catch (FileNotFoundException e) {
  149. e.printStackTrace();
  150. } catch (IOException e) {
  151. e.printStackTrace();
  152. }
  153. // 图片在html文件上的路径 相对路径
  154. return accessPath + imageFileName;
  155. }
  156. });
  157. // 解析word文档
  158. wordToHtmlConverter.processDocument(wordDocument);
  159. Document htmlDocument = wordToHtmlConverter.getDocument();
  160. // 生成html文件上级文件夹
  161. File folder = new File(htmlPath);
  162. if (!folder.exists()) {
  163. folder.mkdirs();
  164. }
  165. OutputStream outStream = new FileOutputStream(htmlFile);
  166. DOMSource domSource = new DOMSource(htmlDocument);
  167. StreamResult streamResult = new StreamResult(outStream);
  168. TransformerFactory factory = TransformerFactory.newInstance();
  169. Transformer serializer = factory.newTransformer();
  170. serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
  171. serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  172. serializer.setOutputProperty(OutputKeys.METHOD, "html");
  173. serializer.transform(domSource, streamResult);
  174. return htmlFile.getAbsolutePath();
  175. }
  176. /**
  177. * 2007版本word转换成html
  178. *
  179. * @param wordPath word文件路径
  180. * @param wordName word文件名称无后缀
  181. * @param suffix word文件后缀
  182. * @param htmlPath html存储地址
  183. * @param prefix 图片存储前缀
  184. * @param accessPath 图片的相对路径访问地址
  185. * @return
  186. * @throws IOException
  187. */
  188. public static String Word2007ToHtml( File wordFile , String wordPath, String wordName, String htmlPath
  189. ,final String prefix ,final String accessPath)
  190. throws IOException, TransformerException, ParserConfigurationException {
  191. String htmlName = wordName + ".html";
  192. //1.1) 拼接HTML文件地址、判断html文件是否存在
  193. File htmlFile = new File(htmlPath + htmlName);
  194. if (htmlFile.exists()) {
  195. return htmlFile.getAbsolutePath();
  196. }
  197. // word文件
  198. //File wordFile = new File(wordPath + File.separator + wordName + suffix);
  199. //1.2) 加载word文档生成 XWPFDocument对象
  200. InputStream in = new FileInputStream( wordFile );
  201. XWPFDocument document = new XWPFDocument( in );
  202. //1.3) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
  203. final String imagePath = wordPath + "image" ;
  204. File imgFolder = new File( imagePath );
  205. XHTMLOptions options = XHTMLOptions.create();
  206. options.setExtractor(new FileImageExtractor(imgFolder));
  207. // html中图片的路径 相对路径
  208. options.URIResolver( new BasicURIResolver( accessPath ) );
  209. options.setIgnoreStylesIfUnused(false);
  210. options.setFragment(true);
  211. //1.4) 将 XWPFDocument转换成XHTML
  212. // 生成html文件上级文件夹
  213. File folder = new File(htmlPath);
  214. if (!folder.exists()) {
  215. folder.mkdirs();
  216. }
  217. OutputStream out = new FileOutputStream(htmlFile);
  218. XHTMLConverter.getInstance().convert(document, out, options);
  219. return htmlFile.getAbsolutePath();
  220. }
  221. public static void main(String[] args) {
  222. try {
  223. //Word2003ToHtml("E:\\templates", "2019-11-26", ".doc", "E://templates/");
  224. WordAutoToHtml( new File("E:\\templates\\OnLineWord.doc") , "image_" , "image/");
  225. //WordAutoToHtml( new File("E:\\templates\\OnLineWord.docx") , "image_" , "image/");
  226. } catch (Exception e) {
  227. e.printStackTrace();
  228. }
  229. }
  230. }

第三步:使用结果

2007版本:Word文档样式

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3lleGlhb21vZGVtbw_size_16_color_FFFFFF_t_70

实际转换结果为:图片居中、字体等样式信息以及自动换行的效果都不理想。

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3lleGlhb21vZGVtbw_size_16_color_FFFFFF_t_70 1

2003版本的Word文档也有上述问题,大家使用此技术时请注意此问题。

参考来源:Java实现word转HTML

发表评论

表情:
评论列表 (有 0 条评论,42人围观)

还没有评论,来说两句吧...

相关阅读