当前位置: 首页 > news >正文

Java 富文本转word

前言:

本文的目的是将传入的富文本内容(HTML 标签、图片)转换并分页导出为 Word 文档。

所使用的库为 docx4j。

一、依赖导入

        <!-- Rich text (HTML) to Word conversion -->
        <!-- NOTE(review): docx4j 6.1.2 is combined with 8.x companion artifacts below; confirm this mix is intended. -->
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j</artifactId>
            <version>6.1.2</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-log4j12</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>log4j</artifactId>
                    <groupId>log4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-compress</artifactId>
                    <groupId>org.apache.commons</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>guava</artifactId>
                    <groupId>com.google.guava</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>mbassador</artifactId>
                    <groupId>net.engio</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j-ImportXHTML</artifactId>
            <version>8.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
            <version>8.1.0</version>
            <exclusions>
                <exclusion>
                    <artifactId>docx4j-core</artifactId>
                    <groupId>org.docx4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>

二、字体文件

将字体文件(默认为 STSongStd-Light.ttf)放到子项目 resources 下的 static/fonts 目录中(对应代码中的类路径 /static/fonts/)。

三、工具类

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import sun.misc.BASE64Decoder;import javax.crypto.Cipher;
import javax.crypto.KeyGenerator;
import javax.crypto.spec.SecretKeySpec;
import java.math.BigInteger;/** * 编码工具类 * 实现aes加密、解密 */  
public class AESEncryptUtils {

    // NOTE(review): ECB mode and a hard-coded key are weak choices. Changing either
    // would change the tokens this class produces, so both are left untouched here.
    // The file-level imports of sun.misc.BASE64Decoder, KeyGenerator and the
    // commons-codec/commons-lang3 helpers are no longer used by this class.

    /** Default key used by the {@code *ToString} convenience methods (16 bytes, i.e. AES-128). */
    public static final String aesKey = "this-is-aescrypt";

    /** Cipher transformation used for both encryption and decryption. */
    private static final String ALGORITHMSTR = "AES/ECB/PKCS5Padding";

    /** Single charset for keys, plaintext and decrypted text, independent of the platform default. */
    private static final java.nio.charset.Charset CHARSET = java.nio.charset.StandardCharsets.UTF_8;

    private AESEncryptUtils() {
        throw new AssertionError();
    }

    /**
     * Prints the Base64 ciphertext of a sample string.
     *
     * @param args ignored
     * @throws Exception if encryption fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println(AESEncryptUtils.aesEncrypt("html2Pdf", "this-is-aescrypt"));
    }

    /**
     * Encrypts {@code content} with the default {@link #aesKey}.
     *
     * @param content plaintext to encrypt
     * @return Base64-encoded ciphertext
     * @throws Exception if the cipher cannot be created or applied
     */
    public static String aesEncryptToString(String content) throws Exception {
        return aesEncrypt(content, aesKey);
    }

    /**
     * Decrypts Base64 ciphertext with the default {@link #aesKey}.
     *
     * @param content Base64-encoded ciphertext
     * @return the plaintext, or {@code null} when {@code content} is null or empty
     * @throws Exception if the cipher cannot be created or applied
     */
    public static String aesDecryptToString(String content) throws Exception {
        return aesDecrypt(content, aesKey);
    }

    /**
     * Renders a byte array as an unsigned number in the given radix.
     *
     * @param bytes bytes to render
     * @param radix target radix; values outside {@link Character#MIN_RADIX}..{@link Character#MAX_RADIX}
     *              fall back to 10 (behaviour of {@link BigInteger#toString(int)})
     * @return the textual representation
     */
    public static String binary(byte[] bytes, int radix) {
        // signum 1: interpret the bytes as a non-negative magnitude
        return new BigInteger(1, bytes).toString(radix);
    }

    /**
     * Base64-encodes the given bytes (single line, with padding).
     *
     * @param bytes bytes to encode, may be {@code null}
     * @return the Base64 text, or {@code null} when {@code bytes} is {@code null}
     */
    public static String base64Encode(byte[] bytes) {
        return bytes == null ? null : java.util.Base64.getEncoder().encodeToString(bytes);
    }

    /**
     * Decodes Base64 text.
     *
     * <p>Uses the JDK MIME decoder, which tolerates line breaks in the input, instead of
     * {@code sun.misc.BASE64Decoder}, which was removed in JDK 9.
     *
     * @param base64Code Base64 text
     * @return decoded bytes, or {@code null} when {@code base64Code} is null or empty
     * @throws Exception kept for signature compatibility
     */
    public static byte[] base64Decode(String base64Code) throws Exception {
        if (base64Code == null || base64Code.isEmpty()) {
            return null;
        }
        return java.util.Base64.getMimeDecoder().decode(base64Code);
    }

    /**
     * AES-encrypts {@code content}.
     *
     * @param content    plaintext
     * @param encryptKey key text; its UTF-8 bytes must form a valid AES key length
     * @return raw ciphertext bytes
     * @throws Exception if the cipher cannot be created or applied
     */
    public static byte[] aesEncryptToBytes(String content, String encryptKey) throws Exception {
        return newCipher(Cipher.ENCRYPT_MODE, encryptKey).doFinal(content.getBytes(CHARSET));
    }

    /**
     * AES-encrypts {@code content} and Base64-encodes the result.
     *
     * @param content    plaintext
     * @param encryptKey key text
     * @return Base64-encoded ciphertext
     * @throws Exception if the cipher cannot be created or applied
     */
    public static String aesEncrypt(String content, String encryptKey) throws Exception {
        return base64Encode(aesEncryptToBytes(content, encryptKey));
    }

    /**
     * AES-decrypts raw ciphertext bytes.
     *
     * @param encryptBytes ciphertext
     * @param decryptKey   key text
     * @return plaintext decoded as UTF-8, matching the charset used when encrypting
     * @throws Exception if the cipher cannot be created or applied
     */
    public static String aesDecryptByBytes(byte[] encryptBytes, String decryptKey) throws Exception {
        byte[] plain = newCipher(Cipher.DECRYPT_MODE, decryptKey).doFinal(encryptBytes);
        return new String(plain, CHARSET);
    }

    /**
     * Decodes Base64 ciphertext and AES-decrypts it.
     *
     * @param encryptStr Base64-encoded ciphertext
     * @param decryptKey key text
     * @return the plaintext, or {@code null} when {@code encryptStr} is null or empty
     * @throws Exception if the cipher cannot be created or applied
     */
    public static String aesDecrypt(String encryptStr, String decryptKey) throws Exception {
        if (encryptStr == null || encryptStr.isEmpty()) {
            return null;
        }
        return aesDecryptByBytes(base64Decode(encryptStr), decryptKey);
    }

    /** Builds a cipher for {@link #ALGORITHMSTR} initialised with the UTF-8 bytes of {@code key}. */
    private static Cipher newCipher(int mode, String key) throws Exception {
        Cipher cipher = Cipher.getInstance(ALGORITHMSTR);
        cipher.init(mode, new SecretKeySpec(key.getBytes(CHARSET), "AES"));
        return cipher;
    }
}

import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;@Data
@ConfigurationProperties(prefix = "html.convert")
@Configuration
public class HtmlConvertproperties {

    /** Directory in which generated files are saved. */
    private String fileSavePath;

    /** Directory in which images converted from ECharts are saved. */
    private String echartsImgSavePath;

}
import org.docx4j.Docx4J;
import org.docx4j.convert.in.xhtml.XHTMLImporterImpl;
import org.docx4j.fonts.IdentityPlusMapper;
import org.docx4j.fonts.Mapper;
import org.docx4j.fonts.PhysicalFont;
import org.docx4j.fonts.PhysicalFonts;
import org.docx4j.jaxb.Context;
import org.docx4j.model.structure.PageSizePaper;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.RFonts;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.*;
import java.net.URL;
import java.net.URLEncoder;import static cn.aotu.sss.module.sss.util.text.word.HtmlConverter.RemoveTag.*;/*** html转换工具类** 图片长宽乘积不能太大,不然会导致内存溢出** HtmlConverter* @author: huangbing* @date: 2020/8/7 2:32 下午*/
public class HtmlConverter {/*** 页面大小*/public enum PageSize {/** 大小*/LETTER("letter"),LEGAL("legal"),A3("A3"),A4("A4"),A5("A5"),B4JIS("B4JIS");PageSize(String code){this.code = code;}private String code;public String getCode() {return code;}}/*** 移除的标签*/enum RemoveTag {/** 移除的标签*/SCRIPT("script"), A("a"), LINK("link"), HREF("href");RemoveTag(String code){this.code = code;}private String code;public String getCode() {return code;}}/*** 参数类*/private static class Params {/** 默认字体库*/private final static String DEFAULT_FONT_FAMILY = "STSongStd-Light";/** 默认字体库路径*/private final static String DEFAULT_FONT_PATH = "/static/fonts/STSongStd-Light.ttf";/** 默认是否横版*/private final static boolean DEFAULT_LAND_SCAPE = false;/** 默认页面尺寸*/private final static String DEFAULT_PAGE_SIZE = PageSize.A4.getCode();/** 字体库*/private String fontFamily = DEFAULT_FONT_FAMILY;/** 字体库路径*/private String fontPath = DEFAULT_FONT_PATH;/** 页面尺寸*/private String pageSize = DEFAULT_PAGE_SIZE;/** 是否横版*/private boolean isLandScape = DEFAULT_LAND_SCAPE;/** 保存的文件的路径 */private String saveFilePath = HtmlConverter.class.getResource("/").getPath() + "output/";}private final Logger logger = LoggerFactory.getLogger(HtmlConverter.class);private Builder builder;public HtmlConverter(Builder builder) {this.builder = builder;}/*** 构建类*/public static class Builder {private Params params;public Builder() {this.params = new Params();this.params.fontFamily = Params.DEFAULT_FONT_FAMILY;this.params.fontPath = Params.DEFAULT_FONT_PATH;this.params.pageSize = Params.DEFAULT_PAGE_SIZE;this.params.isLandScape = Params.DEFAULT_LAND_SCAPE;}public Builder fontFamily(String fontFamily) {this.params.fontFamily = fontFamily;return this;}public Builder fontPath(String fontPath) {this.params.fontPath = fontPath;return this;}public Builder pageSize(String pageSize) {this.params.pageSize = pageSize;return this;}public Builder isLandScape(boolean isLandScape) {this.params.isLandScape = isLandScape;return this;}public Builder 
saveFilePath(String saveFilePath) {this.params.saveFilePath = saveFilePath;return this;}/*** 数据处理完毕之后处理逻辑放在构造函数里面** @return*/public HtmlConverter builder() {return new HtmlConverter(this);}}/*** 将页面保存为 docx** @param url* @param fileName* @return* @throws Exception*/public File saveUrlToDocx(String url, String fileName) throws Exception {return saveDocx(url2word(url), fileName);}/*** 将页面保存为 pdf** @param url* @param fileName* @return* @throws Exception*/public File saveUrlToPdf(String url, String fileName) throws Exception {return savePdf(url2word(url), fileName);}/*** 将页面转为 {@link WordprocessingMLPackage}** @param url* @return* @throws Exception*/public WordprocessingMLPackage url2word(String url) throws Exception {return xhtml2word(url2xhtml(url));}/*** 将 {@link WordprocessingMLPackage} 存为 docx** @param wordMLPackage* @param fileName* @return* @throws Exception*/public File saveDocx(WordprocessingMLPackage wordMLPackage, String fileName) throws Exception {File file = new File(genFilePath(fileName) + ".docx");//保存到 docx 文件wordMLPackage.save(file);if (logger.isDebugEnabled()) {logger.debug("Save to [.docx]: {}", file.getAbsolutePath());}return file;}/*** 将 {@link WordprocessingMLPackage} 存为 pdf** @param wordMLPackage* @param fileName* @return* @throws Exception*/public File savePdf(WordprocessingMLPackage wordMLPackage, String fileName) throws Exception {File file = new File(genFilePath(fileName) + ".pdf");OutputStream os = new FileOutputStream(file);Docx4J.toPDF(wordMLPackage, os);os.flush();os.close();if (logger.isDebugEnabled()) {
//            logger.debug("Save to [.pdf]: {}", file.getAbsolutePath());}return file;}/*** 将 {@link Document} 对象转为 {@link WordprocessingMLPackage}* xhtml to word** @param doc* @return* @throws Exception*/protected WordprocessingMLPackage xhtml2word(Document doc) throws Exception {//A4纸,//横版:trueWordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage(PageSizePaper.valueOf(this.builder.params.pageSize), this.builder.params.isLandScape);//配置中文字体configSimSunFont(wordMLPackage);XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wordMLPackage);//导入 xhtmlwordMLPackage.getMainDocumentPart().getContent().addAll(xhtmlImporter.convert(doc.html(), doc.baseUri()));return wordMLPackage;}/*** 将页面转为{@link Document}对象,xhtml 格式** @param url* @return* @throws Exception*/protected Document url2xhtml(String url) throws Exception {// 添加头部授权参数防止被过滤String token = AESEncryptUtils.aesEncryptToString("html2File");Document doc = Jsoup.connect(url).header("Authorization", token).get();if (logger.isDebugEnabled()) {
//            logger.debug("baseUri: {}", doc.baseUri());}//除去所有 scriptfor (Element script : doc.getElementsByTag(SCRIPT.getCode())) {script.remove();}//除去 a 的 onclick,href 属性for (Element a : doc.getElementsByTag(A.getCode())) {a.removeAttr("onclick");
//            a.removeAttr("href");}//将link中的地址替换为绝对地址Elements links = doc.getElementsByTag(LINK.getCode());for (Element element : links) {String href = element.absUrl(HREF.getCode());if (logger.isDebugEnabled()) {
//                logger.debug("href: {} -> {}", element.attr(HREF.getCode()), href);}element.attr(HREF.getCode(), href);}//转为 xhtml 格式doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);if (logger.isDebugEnabled()) {String[] split = doc.html().split("\n");for (int c = 0; c < split.length; c++) {
//                logger.debug("line {}:\t{}", c + 1, split[c]);}}return doc;}/*** 为 {@link WordprocessingMLPackage} 配置中文字体** @param wordMLPackage* @throws Exception*/protected void configSimSunFont(WordprocessingMLPackage wordMLPackage) throws Exception {Mapper fontMapper = new IdentityPlusMapper();wordMLPackage.setFontMapper(fontMapper);//加载字体文件(解决linux环境下无中文字体问题)URL simsunUrl = this.getClass().getResource(this.builder.params.fontPath);PhysicalFonts.addPhysicalFont(simsunUrl);PhysicalFont simsunFont = PhysicalFonts.get(this.builder.params.fontFamily);fontMapper.put(this.builder.params.fontFamily, simsunFont);//设置文件默认字体RFonts rfonts = Context.getWmlObjectFactory().createRFonts();rfonts.setAsciiTheme(null);rfonts.setAscii(this.builder.params.fontFamily);wordMLPackage.getMainDocumentPart().getPropertyResolver().getDocumentDefaultRPr().setRFonts(rfonts);}/*** 直接通过HTML字符串生成Word处理包(核心修改点)*/public WordprocessingMLPackage htmlString2word(String htmlContent) throws Exception {// 解析 HTML 字符串为 Document 对象Document doc = Jsoup.parse(htmlContent);// 配置输出设置(修正后的关键步骤)doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);// 清理不安全标签(复用原有逻辑)cleanHtml(doc);// 转换为 Word 处理包return xhtml2word(doc);}/*** 清理HTML标签(提取公共方法)*/private void cleanHtml(Document doc) {// 移除script标签doc.getElementsByTag(RemoveTag.SCRIPT.getCode()).remove();// 移除a标签的事件和链接属性doc.getElementsByTag(RemoveTag.A.getCode()).forEach(a -> {a.removeAttr("onclick");
//            a.removeAttr("href");});// 处理link标签的绝对路径(如需加载外部资源,可保留此逻辑)doc.getElementsByTag(RemoveTag.LINK.getCode()).forEach(link -> {String href = link.absUrl(RemoveTag.HREF.getCode());link.attr(RemoveTag.HREF.getCode(), href);});}/*** 公共文件下载处理方法*/public void handleFileDownload(File file,String displayFileName,HttpServletRequest request,HttpServletResponse response) throws Exception {// 文件名编码处理String encodedFileName = URLEncoder.encode(displayFileName, "UTF-8").replaceAll("\\+", "%20"); // 处理空格问题// 设置响应头response.setContentType("application/vnd.openxmlformats-officedocument.wordprocessingml.document");response.setHeader("Content-Disposition", "attachment; filename*=UTF-8''" + encodedFileName);response.setHeader("Content-Length", String.valueOf(file.length()));// 流传输try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));BufferedOutputStream bos = new BufferedOutputStream(response.getOutputStream())) {byte[] buffer = new byte[1024 * 8];int bytesRead;while ((bytesRead = bis.read(buffer)) != -1) {bos.write(buffer, 0, bytesRead);}bos.flush();}}/*** 生成文件位置** @return*/protected String genFilePath(String fileName) {return this.builder.params.saveFilePath + fileName;}public static void main(String[] args) throws Exception {
//        //输入要转换的网址
//        String url = "http://192.168.20.56:8080/viewReport";
//        new Builder().saveFilePath("/Users/huangbing/Desktop/echartsImages/")
//                     .builder()
//                     .saveUrlToDocx(url, "test");String s = "[img1] [img1] [img1]";String s1 = s.replaceAll("\\[img1\\]", "22");System.out.println(s1);}
}

四、controller

@Autowired
private HtmlConvertproperties htmlConvertproperties; // configuration holder that supplies the file save path

/**
 * Converts posted rich-text HTML to a Word document and streams it back as a download.
 *
 * @param htmlContent rich-text HTML markup, e.g. {@code <p>...</p>}
 * @param request     current request
 * @param response    response the document is written to
 * @throws Exception if conversion, saving or streaming fails
 */
@PostMapping("/export")
@Operation(summary = "导出word")
@Parameter(name = "htmlContent", description = "富文本内容", required = true)
public void generateWord(@RequestParam("htmlContent") String htmlContent,
                         HttpServletRequest request,
                         HttpServletResponse response) throws Exception {
    // 1. Build the converter with the configured save path.
    HtmlConverter htmlConverter = new HtmlConverter.Builder()
            .saveFilePath(htmlConvertproperties.getFileSavePath())
            .builder();

    // 2. Convert the HTML string to a Word package.
    WordprocessingMLPackage wordMLPackage = htmlConverter.htmlString2word(htmlContent);

    // 3. Write it to a temporary file.
    String fileName = "report_" + System.currentTimeMillis();
    File tempFile = htmlConverter.saveDocx(wordMLPackage, fileName);

    try {
        // 4. Stream the file to the client.
        htmlConverter.handleFileDownload(tempFile, "报告.docx", request, response);
    } finally {
        // 5. Delete the file right after the request. deleteOnExit() alone would keep
        //    every generated file until the JVM stops; it is used only as a fallback.
        if (!tempFile.delete()) {
            tempFile.deleteOnExit();
        }
    }
}

五、引用说明

工具类参考了 GitHub 上的项目,但对工具类中的具体逻辑作了修改。

https://github.com/FTOLs/report-demo

六、测试

相关文章:

  • java方法引用
  • static成员
  • jQuery的removeClass(),一次删除多个class
  • 4.2 Prompt工程与任务建模:高效提示词设计与任务拆解方法
  • 【学习笔记】文件包含漏洞--相关习题
  • 全面解析 UGC 平台物品冷启动策略
  • 【Linux内核】内核中的中断管理
  • Activepieces - 开源自动化工具
  • 【动手学大模型开发】什么是大语言模型
  • 【阿里云大模型高级工程师ACP习题集】2.4 自动化评测答疑机器人的表现(⭐️⭐️⭐️ 重点章节!!!)
  • Java Collections工具类指南
  • 计算机组成与体系结构:直接内存映射(Direct Memory Mapping)
  • Spring Boot YML配置值“011“在代码中变为9的问题解析
  • leetcode 2799. 统计完全子数组的数目 中等
  • 玩转Docker | Docker部署LMS轻量级音乐工具
  • 加深对vector理解OJ题
  • MQTT 之 EMQX
  • pnpm install报错:此系统上禁止运行脚本
  • 电竞俱乐部护航点单小程序,和平地铁俱乐部点单系统,三角洲护航小程序,暗区突围俱乐部小程序
  • 1.7软考系统架构设计师:系统架构设计师概述 - 超简记忆要点、知识体系全解、考点深度解析、真题训练附答案及解析
  • 冯象|那“交出”后的崩溃,如撒旦坠落诸天
  • 特朗普签署行政命令推动深海采矿,被指无视国际规则,引发环境担忧
  • 获公示拟任省辖市委副书记的胡军,已赴南阳履新
  • 南方医科大学原副校长宁习洲主动投案,接受审查调查
  • 牛市早报|商务部:目前中美之间未进行任何经贸谈判
  • 国家发改委党组在《人民日报》发表署名文章:新时代新征程民营经济发展前景广阔大有可为