关于docx，html，xhtml,pdf直接转换

近期由于项目的原因接触到word转html,pdf转html，之后在线编辑的模块，在网上找了许多资料，经过整理测试，已初具规模

首先doc(docx)在线编辑

1 推荐使用：zohowriter,无插件的web word编辑器

2 推荐使用：docx4J 可以先把docx文档转换为html，

package com.zoma.common;

import java.io.BufferedReader;

import java.io.ByteArrayInputStream;

import java.io.ByteArrayOutputStream;

import java.io.DataOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.io.OutputStreamWriter;

import java.io.PrintWriter;

import org.docx4j.XmlUtils;

import org.docx4j.convert.in.xhtml.XHTMLImporter;

import org.docx4j.convert.out.html.HTMLConversionImageHandler;

import org.docx4j.convert.out.html.HtmlExporterNonXSLT;

import org.docx4j.openpackaging.exceptions.Docx4JException;

import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import org.w3c.tidy.Tidy;

public class DocxUtil {

/**

* docx文档转换为html

* @param filepath –docx 文件路径（f:/1.docx）

* @param outpath–生成html路径(f:1.html)

* @param imgpath–图片保存路径（f/img）

* @param imguri–图片引用（img/）

* @return 转换成功返回true,失败返回false

public static boolean docToHtml(String filepath,String outpath,String imgpath,String imguri)

{

boolean bo = false ;

FileWriter fw = null;

try {

File infile = new File(filepath);

File outfile = new File(outpath);

WordprocessingMLPackage wmp=WordprocessingMLPackage.load(infile);

HtmlExporterNonXSLT hn=new HtmlExporterNonXSLT(wmp, new HTMLConversionImageHandler(imgpath,imguri, true));

String html=(XmlUtils.w3CDomNodeToString(hn.export()));

fw=new FileWriter(outfile);

fw.write(html);

} catch (Exception e) {

e.printStackTrace();

return bo ;

}finally{

try {

fw.close();

} catch (IOException e) {

e.printStackTrace();

}

System.gc();

}

return bo ;

}

/**

* html转换为xhtml

* @param f_in –docx 文件路径（f:/1.html）

* @param outfile–生成xhtml路径(f:1.xhtml)

* @return 转换成功返回true,失败返回false

public static boolean parseToXhtml(String f_in,String outfile)

{

boolean bo = false;

//BufferedInputStream sourceIn; //输入流

ByteArrayOutputStream tidyOutStream = null; //输出流

FileInputStream fis = null;

ByteArrayOutputStream bos = null;

ByteArrayInputStream stream = null;

DataOutputStream to = null;

try

{

// Reader reader;

fis = new FileInputStream(f_in); //读文件

bos = new ByteArrayOutputStream();

int ch;

while((ch=fis.read())!=-1)

{

bos.write(ch);

}

byte[] bs = bos.toByteArray();

bos.close();

String hope_gb2312=new String(bs,”UTF-8″);//注意，默认是GB2312，所以这里先转化成GB2312然后再转化成其他的。

byte[] hope_b=hope_gb2312.getBytes();

String basil=new String(hope_b,”UTF-8″);//将GB2312转化成 UTF-8

// byte[] basil_b=basil.getBytes();

stream= new ByteArrayInputStream(basil.getBytes());

tidyOutStream = new ByteArrayOutputStream();

Tidy tidy = new Tidy();

tidy.setInputEncoding(“UTF-8”);

tidy.setQuiet(true);

tidy.setOutputEncoding(“UTF-8”);

tidy.setShowWarnings(true); //不显示警告信息

tidy.setIndentContent(true);//

tidy.setSmartIndent(true);

tidy.setIndentAttributes(false);

tidy.setWraplen(1024); //多长换行

//输出为xhtml

tidy.setXHTML(true);

tidy.setErrout(new PrintWriter(System.out));

tidy.parse(stream, tidyOutStream);

to=new DataOutputStream(new FileOutputStream(outfile)); //将生成的xhtml写入

tidyOutStream.writeTo(to);

System.out.println(tidyOutStream.toString());

bo = true ;

}

catch ( Exception ex )

{

System.out.println( ex.toString());

ex.printStackTrace();

return bo ;

}finally{

try {

if(to!=null)

{

to.close();

}

if(stream !=null)

{

stream.close();

}

if(fis !=null)

{

fis.close();

}

if(bos !=null)

{

bos.close();

}

if(tidyOutStream !=null)

{

tidyOutStream.close();

}

} catch (IOException e) {

e.printStackTrace();

}

System.gc();

}

return bo;

}

/**

* 获取html内容

* @param filepath 文件路径（f:/1.xhtml）

* @param exps 搜索表达式(html元素标签等)

* @return 搜索内容

public static String getHtmlStyle(String filepath,String exps)

{

String str=””;

try {

File input = new File(filepath);

Document doc = Jsoup.parse(input, “UTF-8”);

Elements els = null;

if(exps.equals(“body”))

{

els= doc.body().children();

}else{

els= doc.select(exps);

}

for(Element el :els)

{

str+=el;

}

} catch (IOException e) {

e.printStackTrace();

return str;

}finally{

System.gc();

}

return str;

}

/**

* 修改html内容

* @param filepath

* @param exps

* @param htmls

* @return

public static boolean modifyHtml(String filepath,String exps,String htmls)

{

boolean bo = false ;

try {

File input = new File(filepath);

Document doc = Jsoup.parse(input, “UTF-8”);

if(exps.equals(“body”))

{

Element el = doc.body();

el.html(“”);

//el.children().html(htmls);

//el.html(htmls);

}

//Elements els = doc.select(exps);

//els.html(htmls);

bo = true;

} catch (IOException e) {

e.printStackTrace();

return bo ;

}

return bo ;

}

public static boolean modifyBody(String infile,String content,String outfile)

{

File file = new File(infile);

BufferedReader reader = null;

FileWriter writer =null;

String fileStr=”” ;

try {

System.out.println(“以行为单位读取文件内容，一次读一整行：”);

reader = new BufferedReader(new FileReader(file));

String tempString = null;

// 一次读入一行，直到读入null为文件结束

while ((tempString = reader.readLine()) != null) {

// 显示行号

fileStr+=tempString;

}

String newStr = fileStr.substring(0,fileStr.indexOf(“<body>”));

newStr+=content;

newStr+=fileStr.substring(fileStr.indexOf(“</body>”),fileStr.length());

//打开一个写文件器，构造函数中的第二个参数true表示以追加形式写文件

writer = new FileWriter(outfile, true);

writer.write(newStr);

} catch (IOException e) {

e.printStackTrace();

} finally {

try {

if(writer !=null)

{

writer.close();

}

} catch (IOException e) {

e.printStackTrace();

}

if (reader != null) {

try {

reader.close();

} catch (IOException e1) {

}

System.gc();

}

return false ;

}

public static boolean modifyHead(String infile,String content,String outfile)

{

File file = new File(infile);

BufferedReader reader = null;

String fileStr=”” ;

try {

System.out.println(“以行为单位读取文件内容，一次读一整行：”);

reader = new BufferedReader(new FileReader(file));

String tempString = null;

// 一次读入一行，直到读入null为文件结束

while ((tempString = reader.readLine()) != null) {

// 显示行号

fileStr+=tempString;

}

reader.close();

String newStr = fileStr.substring(0,fileStr.indexOf(“<head>”));

newStr+=”<head>”;

newStr+=”<meta http-equiv=’Content-Type’ content=’text/html; charset=UTF-8′ />”;

newStr+=content;

newStr+=fileStr.substring(fileStr.indexOf(“</head>”),fileStr.length());

//打开一个写文件器，构造函数中的第二个参数true表示以追加形式写文件

FileWriter writer = new FileWriter(outfile, true);

writer.write(newStr);

writer.close();

} catch (IOException e) {

e.printStackTrace();

} finally {

if (reader != null) {

try {

reader.close();

} catch (IOException e1) {

}

return false ;

}

/**

* xhtml转换为docx文档

* @param infile xhtml路径（f:/1.xhtml）

* @param outfile docx生成路径(f:/1.docx)

* @return

public static boolean xhtmlToDocx(String infile,String outfile)

{

boolean bo = false;

try {

WordprocessingMLPackage wxm=WordprocessingMLPackage.createPackage();

wxm.getMainDocumentPart().getContent().addAll(XHTMLImporter.convert(new File(infile),null, wxm));

wxm.save(new File(outfile));

} catch (Docx4JException e) {

e.printStackTrace();

return bo ;

} finally{

System.gc();

}

return bo ;

}

3 pdf 转html 推荐pdf2htmlex 高保真转化

需要Ubuntu 12.04以上版本，并且安装以下软件

3.1 apt-get install python-software-properties

3.2 sudo add-apt-repository ppa:coolwanglu/pdf2htmlex

3.3 sudo apt-get update

3.4 sudo apt-get install fontforge

3.5 sudo aptitude install poppler-utils

3.6 sudo apt-get install pdf2htmlex

测试输入:pdf2htmlEX --zoom 1.3 /home/1.pdf --dest-dir /home/1

会在/home/1文件夹下生成html文件

4 pdf合并使用pdfbox类库

/**

* @param savepath 原来文件夹路径

* @param filePath合并后名字，，临时PDF文件夹

* 生成新的pdf文件后，删除原有pdf

* @return

* @throws COSVisitorException

* @throws IOException

public static String mergePdfFiles(String savepath, String filePath) throws COSVisitorException, IOException

{

PDFMergerUtility mergePdf = new PDFMergerUtility();

List list = new ArrayList();

File dir = new File(savepath);

System.out.println(“————merge savepath–“+savepath);

System.out.println(“————merge dir–“+dir.getAbsolutePath());

System.out.println(“————merge to file–“+filePath);

File file[] = dir.listFiles();

for (int i = 0; i < file.length; i++) {

if (file[i].isFile())

{

list.add(file[i]);

}

System.out.println(“————–file——–list—————————-“+list.size());

for(int i=0;i<list.size();i++)

{

File f = (File) list.get(i);

InputStream is= new FileInputStream(f);

mergePdf.addSource(is);

}

mergePdf.setDestinationFileName(filePath);

mergePdf.mergeDocuments();

for(int i=0;i<list.size();i++)

{

File f = (File) list.get(i);

f.deleteOnExit();

}

return filePath;

}

5 jpg图片合并转pdf

首先在Ubuntu上安装convert软件

apt-get install imagemagick

apt-get install graphicsmagick-imagemagick-compat

使用命令convert /usr/*.jpg /usr/1.pdf

6 Java程序调用Linux命令

Process proc = Runtime.getRuntime().exec(“”);

本文出自 “开心的傻瓜” 博客，请务必保留此出处

做事不怕难，自无难人事。

相关文章：

你感兴趣的文章：

标签云：