POI处理Word转HTML,读取文档内容上传图片

Java实践 专栏收录该内容
4 篇文章 0 订阅

POI处理Word转HTML,读取文档内容上传图片

     需要给官网做一个文档管理的后台服务,其中需要一个上传Word文档并转换成HTML格式返回给前端的功能。下面和大家一起来看看后端部分的逻辑实现。

实现逻辑

需要用到的引用:

    compile('org.apache.poi:poi:4.1.0')
    compile('org.apache.poi:poi-ooxml:4.1.0')
    compile('org.apache.poi:poi-ooxml-schemas:4.1.0')
    compile('org.apache.poi:poi-scratchpad:4.1.0')
    compile('fr.opensagres.xdocreport:xdocreport:2.0.2')
    compile('org.apache.poi:ooxml-schemas:1.4')

doc转html实现

     /**
     * Converts an uploaded legacy Word ({@code .doc}) file to an HTML string.
     *
     * <p>Each embedded picture is passed to {@code uploadImages} while the document
     * is being processed, and the value it returns is handed back to the converter
     * as the picture's location. The generated HTML is serialized to a temporary
     * file, filtered by {@code splitContext}, and the temporary file is deleted
     * before returning (also when conversion fails).
     *
     * @param file the uploaded {@code .doc} file
     * @return the HTML content as returned by {@code splitContext}
     * @throws Exception if the document cannot be read, converted or serialized
     */
    public String uploadDocFile(MultipartFile file) throws Exception {
        // One unique temp file per call: a fixed "temp.html" would be overwritten
        // or deleted by a concurrent upload, and looking it up through a classpath
        // resource fails with a NullPointerException when that resource is absent.
        File target = File.createTempFile("word2html-", ".html");
        try (HWPFDocument wordDocument = new HWPFDocument(file.getInputStream())) {
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

            // The pictures manager has to be registered BEFORE processDocument():
            // the converter asks it for picture locations while walking the document,
            // so registering it afterwards means it is never invoked.
            wordToHtmlConverter.setPicturesManager((imageStream, pictureType, name, width, height) -> {
                String imageUrl = null;
                try {
                    imageUrl = uploadImages(imageStream);
                } catch (Exception e) {
                    // Best effort: a failed upload must not abort the whole conversion.
                    e.printStackTrace();
                }
                return imageUrl;
            });

            // Convert the Word document into an HTML DOM.
            wordToHtmlConverter.processDocument(wordDocument);

            // Serialize the DOM to the temporary HTML file.
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(target));

            // Read the file back and apply the content filter.
            return splitContext(target.getAbsolutePath());
        } finally {
            // Always remove the temporary file, including on failure.
            target.delete();
        }
    }

    /**
     * Reads an HTML file as UTF-8 and returns its content on a single line with
     * every double quote replaced by a single quote.
     *
     * <p>Lines are concatenated without any separator (line terminators are
     * dropped). If the file cannot be read, the stack trace is printed and an
     * empty string is returned.
     *
     * @param filePath path of the HTML file to read
     * @return the filtered content, or {@code ""} when reading fails
     */
    public static String splitContext(String filePath) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) exactly
        // once, on both the success and the failure path.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(filePath)), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
        // Plain character replacement; no regular expression is needed for a literal.
        return sb.toString().replace('"', '\'');
    }

docx转html实现

    /**
     * Converts an uploaded {@code .docx} file to an HTML string.
     *
     * <p>Pictures are extracted into a per-call directory under
     * {@code java.io.tmpdir}; every file found in its {@code word/media}
     * sub-directory is uploaded through {@code uploadImages}, and its local path
     * in the HTML is replaced by the value that call returns. The extraction
     * directory is deleted before returning, also when conversion or upload fails.
     *
     * @param file the uploaded {@code .docx} file
     * @return the generated HTML with picture paths replaced
     * @throws Exception if the document cannot be read or converted, or a picture upload fails
     */
    public String uploadDocXFile(MultipartFile file) throws Exception {
        // Directory the converter extracts pictures into.
        String path = System.getProperty("java.io.tmpdir");
        String firstImagePathStr = path + "/" + System.currentTimeMillis();
        File firstImagePath = new File(firstImagePathStr);

        try (XWPFDocument docxDocument = new XWPFDocument(file.getInputStream())) {
            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(firstImagePath));
            // Picture URIs in the HTML are resolved against the extraction directory,
            // which is what makes the path replacement below match.
            options.URIResolver(new BasicURIResolver(firstImagePathStr));

            // Convert to HTML in memory.
            ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            String htmlStr = htmlStream.toString();

            // Upload every extracted picture and point the HTML at the uploaded copy.
            String imageDirStr = firstImagePathStr + "/word/media";
            String[] imageList = new File(imageDirStr).list();
            if (imageList != null) {
                for (String imageName : imageList) {
                    String oneImagePathStr = imageDirStr + "/" + imageName;
                    MultipartFile multipartFile = getMulFileByPath(oneImagePathStr);
                    String imageUrl = uploadImages(multipartFile);
                    htmlStr = htmlStr.replace(oneImagePathStr, imageUrl);
                }
            }
            return htmlStr;
        } finally {
            // Remove the extracted pictures even if conversion or upload threw.
            FileUtils.deleteDirectory(firstImagePath);
        }
    }

    /**
     * Wraps the file at the given path in a {@code MultipartFile}.
     *
     * @param picPath path of the file on disk
     * @return a {@code CommonsMultipartFile} built from the item that
     *         {@code createFileItem} creates for {@code picPath}
     */
    private static MultipartFile getMulFileByPath(String picPath) {
        return new CommonsMultipartFile(createFileItem(picPath));
    }
    /**
     * Creates a {@code FileItem} named {@code "MyFileName" + extension} and copies
     * the content of the given file into it.
     *
     * <p>The extension is taken from the last path segment only; a file without an
     * extension yields the plain name {@code "MyFileName"}. If copying fails, the
     * stack trace is printed and the item is returned with whatever content was
     * written up to that point.
     *
     * @param filePath path of the file to copy
     * @return the populated file item
     */
    private static FileItem createFileItem(String filePath) {
        File source = new File(filePath);

        // Look for the dot in the file name, not in the whole path: a dot in a
        // directory name must not be mistaken for an extension, and a name without
        // any dot must not make substring() throw.
        String fileName = source.getName();
        int dotIndex = fileName.lastIndexOf('.');
        String extFile = dotIndex >= 0 ? fileName.substring(dotIndex) : "";

        FileItemFactory factory = new DiskFileItemFactory(16, null);
        FileItem item = factory.createItem("textField", "text/plain", true,
                "MyFileName" + extFile);

        // try-with-resources closes both streams even when read/write throws.
        try (FileInputStream fis = new FileInputStream(source);
             OutputStream os = item.getOutputStream()) {
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = fis.read(buffer, 0, buffer.length)) != -1) {
                os.write(buffer, 0, bytesRead);
            }
        } catch (IOException e) {
            // Best effort, as before: report the failure and return the item as is.
            e.printStackTrace();
        }
        return item;
    }

小结

     实现主要依赖POI包:将文件流传入Document并转换为HTML,读取文档中的图片进行上传处理,并将上传后的链接替换到生成的HTML中。

  • 0
    点赞
  • 2
    评论
  • 7
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

相关推荐
©️2020 CSDN 皮肤主题: 程序猿惹谁了 设计师:白松林 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值