package com.af.plugins.office;

import org.apache.commons.io.FileUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.List;

public class DocUtil {
    /**
     * word文档   docx格式转为html
     * @param rootPath
     * 			项目的路径
     * @param fileUrl
     * 			数据库存放文件的相对路径   (例：/file/aa.doc)
     * @return
     * 			返回html的路径
     */
    public static String docxToHtml(String rootPath,String fileUrl) throws IOException {
        //doc文档名称
        String docName = fileUrl.substring(0,fileUrl.lastIndexOf("."));
        //html文件名
        String htmlName = docName + ".html";
        //图片文件夹
        String imagePath = rootPath + "/image/"+docName;
        //html文件路径
        String htmlUrl = rootPath + htmlName;

        // 1) 加载word文档生成 XWPFDocument对象
        InputStream input = new FileInputStream(rootPath + fileUrl);
        XWPFDocument document = new XWPFDocument(input);

        // 2) 解析 XHTML配置 (这里设置URIResolver来设置图片存放的目录)
        File imgFolder = new File(imagePath);
        XHTMLOptions options = XHTMLOptions.create();
        options.setExtractor(new FileImageExtractor(imgFolder));
        // html中图片的路径 相对路径
        options.URIResolver(new BasicURIResolver("image/"+docName));
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);

        // 3) 将 XWPFDocument转换成XHTML
        // 生成html文件上级文件夹
        File folder = new File(htmlUrl);
        if (!folder.exists()) {
            folder.mkdirs();
        }

        // 生成图片文件上级文件夹
        File folder2 = new File(imagePath);
        if (!folder2.exists()) {
            folder2.mkdirs();
        }

        File htmlFile = new File(htmlUrl);
        OutputStream out = new FileOutputStream(htmlFile);
        XHTMLConverter.getInstance().convert(document, out, options);
        return htmlUrl;
    }

    public static void main(String[] args) throws IOException {
        docToHtml("D:/a/","新建 DOC 文档.doc");
        docxToHtml("D:/a/","40-iES-CMR与奥枫营收系统对接文档.docx");
    }

    /**
     * word文档   doc格式转为html
     * @param rootPath
     * 			项目的路径
     * @param fileUrl
     * 			数据库存放文件的相对路径   (例：/file/aa.doc)
     * @return
     * 			返回html的路径
     */
    public static String docToHtml(String rootPath,String fileUrl) {
        InputStream input = null;
        //doc文档名称
        String docName = fileUrl.substring(0,fileUrl.lastIndexOf("."));
        //html文件名
        String htmlName = docName + ".html";
        //html文件路径
        String htmlUrl = rootPath + htmlName;
        //图片文件夹
        String imagePath = rootPath + "/image/"+docName+"/word/media/";

        // 生成图片文件上级文件夹
        File folder = new File(imagePath);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        // 下面有很多try catch   如果感觉麻烦就直接用一个 ,catch里面变为Exception即可
        try {
            input = new FileInputStream(rootPath + fileUrl);
        } catch (FileNotFoundException e1) {
            e1.printStackTrace();
        }
        //  HWPFDocument是poi中用来读取doc文件的
        HWPFDocument wordDocument = null;
        try {
            wordDocument = new HWPFDocument(input);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        // 官网地址：http://poi.apache.org/apidocs/dev/org/apache/poi/hwpf/converter/WordToHtmlConverter.html
        // 用来将word转为html
        WordToHtmlConverter wordToHtmlConverter = null;
        try {
            wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        } catch (ParserConfigurationException e1) {
            e1.printStackTrace();
        }
        // setPicturesManager用来处理图片
        wordToHtmlConverter.setPicturesManager(new PicturesManager(){
            public String savePicture( byte[] content,
                                       PictureType pictureType, String suggestedName,
                                       float widthInches, float heightInches ) {
                return "/image/" + docName + "/word/media/" + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);
        List pics = wordDocument.getPicturesTable().getAllPictures();
        for (Object o : pics) {
            Picture pic = (Picture) o;
            try {
                pic.writeImageContent(new FileOutputStream(imagePath
                        + docName+"_"+pic.suggestFullFileName()));
            } catch (IOException e) {
                e.fillInStackTrace();
            }
        }
        // 该Document接口表示整个HTML或XML文档。从概念上讲，它是文档树的根，并提供对文档数据的主要访问。
        Document htmlDocument = wordToHtmlConverter.getDocument();
        // 字节数组输出流
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        // 以文档对象模型（DOM）树的形式充当转换源树的持有者。
        DOMSource domSource = new DOMSource(htmlDocument);
        // 充当转换结果的持有者，可以是XML，纯文本，HTML或其他形式的标记。
        StreamResult streamResult = new StreamResult(outStream);
        TransformerFactory tf = TransformerFactory.newInstance();
        // 处理来自各种源的XML，并将转换输出写入各种接收器。
        Transformer serializer = null;
        try {
            serializer = tf.newTransformer();
        } catch (TransformerConfigurationException e1) {
            e1.printStackTrace();
        }
        // 设置对转换有效的输出属性
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        try {
            serializer.transform(domSource, streamResult);
        } catch (TransformerException e) {
            e.printStackTrace();
        }
        try {
            outStream.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        String content = new String(outStream.toByteArray());
        try {
            /**
             * FileUtils.writeStringToFile(file, data, encoding),把字符串写进对应的文件中
             * file是新建的文件    data是写入的内容    encoding是编码格式
             */
            FileUtils.writeStringToFile(new File(htmlUrl),content, "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return htmlUrl;
    }
}
