.docx
格式时,你运气不错。https://github.com/OfficeDev/Open-Xml-PowerTools/tree/abfbaac510d0d60e2f492503c60ef897247716cf
现在我们可以将文档内容提取为HTML,接下来需要将其转换为PDF。有几个库可以将HTML转换为PDF,例如DinkToPdf是一个跨平台的封装库,基于Webkit HTML to PDF库libwkhtmltox。
Docx转HTML
让我们把这一切都放在一起,下载OpenXMLSDK-PowerTools .Net Core项目并构建它(只需关注OpenXMLPowerTools.Core和OpenXMLPowerTools.Core.Example - 忽略其他项目)。
将OpenXMLPowerTools.Core.Example设置为启动项目。将一个Word文档添加到项目中(例如test.docx),并设置该docx文件的属性Copy To Output = If Newer
运行控制台项目:
// Sample entry point: converts test.docx to HTML, prints the markup to the console
// and saves it as test.html.
// NOTE(review): the closing brace of this method is missing from the snippet as shown.
static void Main(string[] args)
{
// Open the Word file as a package and wrap it in a WordprocessingDocument.
// NOTE(review): neither `source` nor `document` is disposed in this snippet.
var source = Package.Open(@"test.docx");
var document = WordprocessingDocument.Open(source);
// Convert with a default-constructed HtmlConverterSettings instance.
HtmlConverterSettings settings = new HtmlConverterSettings();
XElement html = HtmlConverter.ConvertToHtml(document, settings);
Console.WriteLine(html.ToString());
// Write the same markup to test.html (path is relative to the working directory).
var writer = File.CreateText("test.html");
writer.WriteLine(html.ToString());
writer.Dispose();
// Block until a line is read from standard input before continuing.
Console.ReadLine();
static Uri FixUri(string brokenUri)
方法,以返回一个Uri
,并添加了用户友好的错误消息。static void Main(string[] args)
{
// Sample entry point: converts a fixed .docx file to HTML via ParseDOCX and writes
// the result to test1.html. If parsing fails with an "Invalid Hyperlink" package
// error, the file is repaired with UriFixer and parsed a second time.
var fileInfo = new FileInfo(@"c:\temp\MyDocWithImages.docx");
string fullFilePath = fileInfo.FullName;
string htmlText = string.Empty;
try
{
htmlText = ParseDOCX(fileInfo);
}
catch (OpenXmlPackageException e)
{
// NOTE(review): this handler only runs if ParseDOCX lets OpenXmlPackageException
// propagate to the caller - confirm.
// NOTE(review): package errors without "Invalid Hyperlink" in their text are
// swallowed here, leaving htmlText empty.
if (e.ToString().Contains("Invalid Hyperlink"))
{
// The file on disk is opened read/write and passed to UriFixer together with
// FixUri as the replacement callback.
// NOTE(review): FileMode.OpenOrCreate creates the file if it does not exist - confirm intended.
using (FileStream fs = new FileStream(fullFilePath,FileMode.OpenOrCreate, FileAccess.ReadWrite))
{
UriFixer.FixInvalidUri(fs, brokenUri => FixUri(brokenUri));
}
// Second attempt after the repair.
htmlText = ParseDOCX(fileInfo);
}
}
// Write the HTML (or the empty string) to test1.html in the working directory.
var writer = File.CreateText("test1.html");
writer.WriteLine(htmlText.ToString());
writer.Dispose();
}
/// <summary>
/// Produces a valid replacement for a hyperlink target that could not be parsed.
/// </summary>
/// <remarks>
/// The previous implementation passed either a bare e-mail address or a single
/// space to <c>new Uri(string)</c>; neither is an absolute URI, so every call
/// threw <see cref="UriFormatException"/>. This version never throws.
/// </remarks>
/// <param name="brokenUri">The malformed hyperlink target; may be null or empty.</param>
/// <returns>
/// A <c>mailto:</c> URI when an address following "mailto:" can be parsed,
/// otherwise the placeholder <c>http://broken-link/</c>.
/// </returns>
public static Uri FixUri(string brokenUri)
{
    const string mailToPrefix = "mailto:";
    Uri placeholder = new Uri("http://broken-link/");

    if (string.IsNullOrWhiteSpace(brokenUri))
    {
        return placeholder;
    }

    int prefixIndex = brokenUri.IndexOf(mailToPrefix, StringComparison.OrdinalIgnoreCase);
    if (prefixIndex >= 0)
    {
        // Keep only what follows the scheme and re-attach a clean prefix.
        string address = brokenUri.Substring(prefixIndex + mailToPrefix.Length).Trim();
        Uri mailUri;
        if (address.Length > 0
            && Uri.TryCreate(mailToPrefix + address, UriKind.Absolute, out mailUri))
        {
            return mailUri;
        }
    }

    return placeholder;
}
/// <summary>
/// Converts the .docx file described by <paramref name="fileInfo"/> into a single
/// HTML string in which images are embedded as base64 data URIs.
/// </summary>
/// <param name="fileInfo">The Word document to convert. It is read into memory; the file itself is not written to.</param>
/// <returns>
/// The HTML markup (with an html doctype), or a fixed error message when the
/// conversion fails for any reason other than an invalid hyperlink.
/// </returns>
/// <exception cref="OpenXmlPackageException">
/// Rethrown when its text contains "Invalid Hyperlink", so that a caller can
/// repair the document and retry. Previously the catch-all below swallowed it.
/// </exception>
public static string ParseDOCX(FileInfo fileInfo)
{
    try
    {
        byte[] byteArray = File.ReadAllBytes(fileInfo.FullName);
        using (MemoryStream memoryStream = new MemoryStream())
        {
            // The document is opened in editable mode, so work on an expandable in-memory copy.
            memoryStream.Write(byteArray, 0, byteArray.Length);
            using (WordprocessingDocument wDoc =
                WordprocessingDocument.Open(memoryStream, true))
            {
                // Prefer the dc:title core property; fall back to the full file path.
                var pageTitle = fileInfo.FullName;
                var part = wDoc.CoreFilePropertiesPart;
                if (part != null)
                {
                    pageTitle = (string)part.GetXDocument()
                        .Descendants(DC.title)
                        .FirstOrDefault() ?? fileInfo.FullName;
                }

                WmlToHtmlConverterSettings settings = new WmlToHtmlConverterSettings()
                {
                    AdditionalCss = "body { margin: 1cm auto; max-width: 20cm; padding: 0; }",
                    PageTitle = pageTitle,
                    FabricateCssClasses = true,
                    CssClassPrefix = "pt-",
                    RestrictToSupportedLanguages = false,
                    RestrictToSupportedNumberingFormats = false,
                    ImageHandler = imageInfo =>
                    {
                        // Expect a content type such as "image/png"; skip the image otherwise.
                        if (imageInfo.ContentType == null) return null;
                        string[] contentTypeParts = imageInfo.ContentType.Split('/');
                        if (contentTypeParts.Length < 2) return null;

                        // Invariant casing: culture-sensitive lowering can break the comparisons below.
                        string subtype = contentTypeParts[1].ToLowerInvariant();

                        ImageFormat imageFormat;
                        switch (subtype)
                        {
                            case "png": imageFormat = ImageFormat.Png; break;
                            case "gif": imageFormat = ImageFormat.Gif; break;
                            case "bmp": imageFormat = ImageFormat.Bmp; break;
                            case "jpeg": imageFormat = ImageFormat.Jpeg; break;
                            case "tiff": imageFormat = ImageFormat.Gif; break; // TIFF is re-encoded as GIF
                            case "x-wmf": imageFormat = ImageFormat.Wmf; break;
                            default: return null; // unsupported type: omit the image
                        }

                        string base64;
                        try
                        {
                            using (MemoryStream ms = new MemoryStream())
                            {
                                imageInfo.Bitmap.Save(ms, imageFormat);
                                base64 = System.Convert.ToBase64String(ms.ToArray());
                            }
                        }
                        catch (System.Runtime.InteropServices.ExternalException)
                        {
                            return null;
                        }

                        // The MIME type is taken from the bitmap's raw format, as before.
                        // If no decoder matches, omit this one image instead of throwing
                        // and failing the whole conversion.
                        ImageFormat format = imageInfo.Bitmap.RawFormat;
                        ImageCodecInfo codec = ImageCodecInfo.GetImageDecoders()
                            .FirstOrDefault(c => c.FormatID == format.Guid);
                        if (codec == null) return null;

                        string imageSource =
                            string.Format("data:{0};base64,{1}", codec.MimeType, base64);

                        return new XElement(Xhtml.img,
                            new XAttribute(NoNamespace.src, imageSource),
                            imageInfo.ImgStyleAttribute,
                            imageInfo.AltText != null ?
                                new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
                    }
                };

                XElement htmlElement = WmlToHtmlConverter.ConvertToHtml(wDoc, settings);
                var html = new XDocument(new XDocumentType("html", null, null, null),
                    htmlElement);
                return html.ToString(SaveOptions.DisableFormatting);
            }
        }
    }
    catch (OpenXmlPackageException e) when (e.ToString().Contains("Invalid Hyperlink"))
    {
        // Let the caller see this one: it is repairable and the caller retries after fixing it.
        throw;
    }
    catch
    {
        return "The file is either open, please close it or contains corrupt data";
    }
}
HTML转PDF
接下来我们需要将HTML传递给DinkToPdf。下载DinkToPdf(90 MB)解决方案。构建解决方案 - 所有的包都需要恢复并编译解决方案,这需要一些时间。
重要提示:
DinkToPdf库需要在项目的根目录下放置libwkhtmltox.so和libwkhtmltox.dll文件,如果您想在Linux和Windows上运行。如果需要,还有一个libwkhtmltox.dylib文件供Mac使用。
这些DLL文件位于v0.12.4文件夹中。根据您的计算机是32位还是64位,将这3个文件复制到DinkToPdf-master\DinkToPfd.TestConsoleApp\bin\Debug\netcoreapp1.1文件夹中。
重要提示2:
确保您的Docker镜像或Linux机器上安装了libgdiplus。libwkhtmltox.so库依赖于它。
将DinkToPfd.TestConsoleApp设置为启动项目,并将Program.cs文件更改为从使用Open-Xml-PowerTools保存的HTML文件中读取htmlContent,而不是Lorem Ipsum文本。
// Describe the PDF job: colour output on landscape A4.
var doc = new HtmlToPdfDocument();
doc.GlobalSettings.ColorMode = ColorMode.Color;
doc.GlobalSettings.Orientation = Orientation.Landscape;
doc.GlobalSettings.PaperSize = PaperKind.A4;

// A single page object whose content is the HTML file saved by the previous step.
var page = new ObjectSettings
{
    PagesCount = true,
    HtmlContent = File.ReadAllText(@"C:\TFS\Sandbox\Open-Xml-PowerTools-abfbaac510d0d60e2f492503c60ef897247716cf\ToolsTest\test1.html")
};
page.WebSettings.DefaultEncoding = "utf-8";

// Page numbers go on the right of both header and footer; only the header draws a line.
page.HeaderSettings.FontSize = 9;
page.HeaderSettings.Right = "Page [page] of [toPage]";
page.HeaderSettings.Line = true;
page.FooterSettings.FontSize = 9;
page.FooterSettings.Right = "Page [page] of [toPage]";

doc.Objects.Add(page);
.doc
和 .docx
都转换成 PDF。我建议你自己创建一个服务,使用特定的非服务器 Windows/Microsoft 技术将 .doc 转换为 docx。doc 格式是二进制的,不适用于服务器端的办公自动化。
更新2:
Nick Chapsas发布了这个很酷的视频《在.NET中创建PDF的最简单方法》,它使用了QuestPDF,这是一款免费产品,适用于年收入低于100万美元的公司。在创建PDF时,它为您提供了一个很酷的视图(PDF的快速应用开发): https://www.youtube.com/watch?v=_M0IgtGWnvE&t=3m45s
b2xtranslator
,从专用的ZIP实现切换到System.IO.Compression
,并且修复了奇怪的命令行测试,改为使用NUnit。目前还不完美 - 正在努力让所有单元测试通过,并添加新的以覆盖更多用例/代码。如果你(或任何人)有兴趣,欢迎成为贡献者。 - Keith
LibreOffice项目是一个开源跨平台的MS Office替代品。我们可以利用它的功能将doc
和docx
文件导出为PDF
。目前,LibreOffice没有官方的.NET API,因此,我们将直接与soffice
二进制文件通信。
这是一种有点“hacky”的解决方案,但我认为这是错误最少、维护成本最低的解决方案。这种方法的另一个优点是,您不仅限于从doc
和docx
转换:您可以从LibreOffice支持的任何格式进行转换(例如odt、html、电子表格等)。
我编写了一个简单的c#
程序,使用了soffice
二进制文件。这只是一个概念证明(也是我在c#
中的第一个程序)。它支持Windows
系统;在已安装LibreOffice包的情况下,也支持Linux
系统。
这是main.cs
:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Reflection;
namespace DocToPdf
{
    /// <summary>
    /// Thrown when the soffice process finishes with a non-zero exit code.
    /// </summary>
    public class LibreOfficeFailedException : Exception
    {
        /// <param name="exitCode">The exit code reported by the soffice process.</param>
        public LibreOfficeFailedException(int exitCode)
            // "{0}" rather than "{}": an empty format item makes string.Format throw FormatException.
            : base(string.Format("LibreOffice has failed with {0}", exitCode))
        {
        }
    }

    class Program
    {
        /// <summary>
        /// Returns the soffice binary location for the current platform: a fixed
        /// path on Unix, or a path relative to this assembly on Windows.
        /// </summary>
        /// <exception cref="PlatformNotSupportedException">For any other platform.</exception>
        static string getLibreOfficePath()
        {
            switch (Environment.OSVersion.Platform)
            {
                case PlatformID.Unix:
                    return "/usr/bin/soffice";
                case PlatformID.Win32NT:
                    string binaryDirectory = System.IO.Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
                    return binaryDirectory + "\\Windows\\program\\soffice.exe";
                default:
                    throw new PlatformNotSupportedException("Your OS is not supported");
            }
        }

        /// <summary>
        /// Wraps a command-line argument in double quotes so that paths containing
        /// spaces reach soffice as one argument. Embedded quotes are backslash-escaped.
        /// </summary>
        static string quoteArgument(string argument)
        {
            return "\"" + argument.Replace("\"", "\\\"") + "\"";
        }

        /// <summary>
        /// Converts the file named by the first command-line argument to PDF by
        /// running soffice in the current working directory.
        /// </summary>
        /// <param name="args">args[0] is the input document path.</param>
        /// <exception cref="LibreOfficeFailedException">soffice exited with a non-zero code.</exception>
        static void Main(string[] args)
        {
            if (args == null || args.Length == 0)
            {
                // Previously this case surfaced as an IndexOutOfRangeException.
                Console.Error.WriteLine("Usage: DocToPdf <input-file>");
                Environment.ExitCode = 1;
                return;
            }

            string libreOfficePath = getLibreOfficePath();
            ProcessStartInfo procStartInfo = new ProcessStartInfo(
                libreOfficePath,
                string.Format("--convert-to pdf --nologo {0}", quoteArgument(args[0])));
            procStartInfo.RedirectStandardOutput = true;
            procStartInfo.UseShellExecute = false;
            procStartInfo.CreateNoWindow = true;
            procStartInfo.WorkingDirectory = Environment.CurrentDirectory;

            using (Process process = new Process() { StartInfo = procStartInfo })
            {
                process.Start();
                // Drain redirected output before waiting: a child that fills the pipe
                // buffer would otherwise block forever.
                process.StandardOutput.ReadToEnd();
                process.WaitForExit();

                // Check for failed exit code.
                if (process.ExitCode != 0)
                {
                    throw new LibreOfficeFailedException(process.ExitCode);
                }
            }
        }
    }
}
我在 Arch Linux 上使用 mono
编译并测试了它。我使用 mono 和 Linux 二进制文件运行它,也使用 wine
运行了 Windows 二进制文件。
您可以在Tests目录中找到结果:
输入文件:testdoc.doc,testdocx.docx
最近我使用 FreeSpire.Doc 实现了这个功能。免费版本有3页限制,但可以轻松地将docx文件转换为PDF,例如:
/// <summary>
/// Converts every entry of <c>listOfDocx</c> to a PDF file named
/// <c>PdfTempStorage + index + ".pdf"</c> and records each output path in
/// <c>listOfPDF</c>, updating the progress members on each iteration.
/// </summary>
/// <remarks>
/// Exceptions propagate unchanged. The former <c>catch (Exception e) { throw e; }</c>
/// wrapper added nothing and reset the stack trace, so it has been removed.
/// </remarks>
private void ConvertToPdf()
{
    for (int i = 0; i < listOfDocx.Count; i++)
    {
        CurrentModalText = "Converting To PDF";
        CurrentLoadingNum += 1;

        // The output path is registered before the conversion runs, as before.
        string savePath = PdfTempStorage + i + ".pdf";
        listOfPDF.Add(savePath);

        Spire.Doc.Document document = new Spire.Doc.Document(listOfDocx[i], FileFormat.Auto);
        document.SaveToFile(savePath, FileFormat.PDF);
    }
}
后来我使用iTextSharp.pdf将这些单独的PDF拼接在一起:
/// <summary>
/// Merges the given PDF byte arrays into one PDF and returns its bytes. Unless the
/// content type of the first <paramref name="localList"/> entry has code "LAB", a
/// checklist PDF is inserted at the front of <paramref name="pdfByteContent"/> first.
/// </summary>
/// <param name="pdfByteContent">PDF documents to merge, in order. This list is modified when a checklist is inserted.</param>
/// <param name="localList">Mail items; the first entry selects the content type and the whole list is passed to the checklist builder.</param>
/// <returns>The merged PDF as a byte array.</returns>
public static byte[] concatAndAddContent(List<byte[]> pdfByteContent, List<MailComm> localList)
{
    using (var output = new MemoryStream())
    {
        using (var mergedDocument = new Document())
        {
            using (var smartCopy = new PdfSmartCopy(mergedDocument, output))
            {
                mergedDocument.Open();

                // Decide whether the checklist goes first.
                using (var context = new StudyContext())
                {
                    var firstContentId = localList[0].ContentID;
                    var matchingTypes = context.MailContentTypes
                        .Where(x => x.ContentId == firstContentId)
                        .ToList();
                    bool isLab = matchingTypes[0].Code.Equals("LAB");
                    if (!isLab)
                    {
                        pdfByteContent.Insert(0, CheckListCreation.createCheckBox(localList));
                    }
                }

                // Append each source PDF as a whole document.
                foreach (var pdfBytes in pdfByteContent)
                {
                    using (var pdfReader = new PdfReader(pdfBytes))
                    {
                        smartCopy.AddDocument(pdfReader);
                    }
                }

                mergedDocument.Close();
            }
        }

        // Read the bytes back before the stream's own using block ends.
        return output.ToArray();
    }
}
我不知道这是否适合您的使用情况,因为您没有说明要撰写的文件大小,但如果它们小于3页或者您可以将它们处理成小于3页,则可以将其转换为PDF格式。
如下评论所述,它也无法处理RTL语言,感谢@Aria指出。
我之前尝试过。它已经使用LibreOffice将docx转换为pdf,但它还有许多其他特性。此外,它是一个无状态的docker化API,自给自足。
the specified package is invalid. the main part is missing
在这一行
var document = WordprocessingDocument.Open(source);
test.docx
只有 1kb。为了解决这个问题,右键点击 test.docx
> 属性
,将 复制到输出目录
设置为 始终复制
即可解决此问题。
string locationOfLibreOfficeSoffice = @"C:\PortableApps\LibreOfficePortable\App\libreoffice\program\soffice.exe";
var docxLocation = "MyWordDocument.docx";
// The generator is created as `rep`; the calls below previously used an
// undeclared `test` variable and could not compile.
var rep = new ReportGenerator(locationOfLibreOfficeSoffice);
//Convert from DOCX to PDF
rep.Convert(docxLocation, Path.Combine(Path.GetDirectoryName(docxLocation), "Test-Template-out.pdf"));
//Convert from DOCX to HTML
rep.Convert(docxLocation, Path.Combine(Path.GetDirectoryName(docxLocation), "Test-Template-out.html"));
如您所见,您还可以将DOCX转换为HTML。此外,您还可以在Word文档中放置占位符,然后用值来“填充”它们。这不在您提出的问题范围内,不过您可以在Github的自述文件(README)中阅读有关该功能的信息。
这是对Jeremy Thompson非常有帮助的答案的补充。除了Word文档正文之外,我还想将Word文档的页眉(和页脚)转换为HTML。我不想修改Open-Xml-PowerTools,因此我修改了Jeremy示例中的Main()和ParseDOCX(),并添加了两个新函数。现在,ParseDOCX接受一个字节数组,因此原始的Word Docx没有被修改。
/// <summary>
/// Sample entry point: converts the body and the first header of a fixed .docx
/// file to HTML and writes them to test1.html and header1.html.
/// </summary>
/// <remarks>
/// Fixes over the earlier version:
/// the undeclared <c>fullFilePath</c> is gone;
/// invalid hyperlinks are repaired on an in-memory copy, so the retry sees the
/// repaired bytes and the file on disk is left untouched;
/// package errors other than "Invalid Hyperlink" now propagate instead of
/// silently producing empty output files.
/// </remarks>
static void Main(string[] args)
{
    var fileInfo = new FileInfo(@"c:\temp\MyDocWithImages.docx");
    byte[] fileBytes = File.ReadAllBytes(fileInfo.FullName);
    string htmlText;
    string htmlHeader;
    try
    {
        htmlText = ParseDOCX(fileBytes, fileInfo.Name, false);
        htmlHeader = ParseDOCX(fileBytes, fileInfo.Name, true);
    }
    catch (OpenXmlPackageException e) when (e.ToString().Contains("Invalid Hyperlink"))
    {
        // Repair a copy of the bytes. Assumes UriFixer.FixInvalidUri accepts any
        // seekable read/write stream, as it did with the FileStream used before.
        using (MemoryStream repaired = new MemoryStream())
        {
            repaired.Write(fileBytes, 0, fileBytes.Length);
            UriFixer.FixInvalidUri(repaired, brokenUri => FixUri(brokenUri));
            // ToArray remains valid even if the fixer closed the stream.
            fileBytes = repaired.ToArray();
        }
        htmlText = ParseDOCX(fileBytes, fileInfo.Name, false);
        htmlHeader = ParseDOCX(fileBytes, fileInfo.Name, true);
    }

    // `using` closes each file even if a write fails.
    using (var writer = File.CreateText("test1.html"))
    {
        writer.WriteLine(htmlText);
    }
    using (var writer2 = File.CreateText("header1.html"))
    {
        writer2.WriteLine(htmlHeader);
    }
}
/// <summary>
/// Converts an in-memory .docx document to a single HTML string in which images
/// are embedded as base64 data URIs.
/// </summary>
/// <param name="fileBytes">The raw bytes of the .docx file; they are copied, not modified.</param>
/// <param name="filename">Used as the page title when the document has no dc:title property.</param>
/// <param name="headerOnly">When true, the body is replaced by the first header's content before conversion.</param>
/// <returns>
/// The HTML markup (with an html doctype), or a fixed error message when the
/// conversion fails for any reason other than an invalid hyperlink.
/// </returns>
/// <exception cref="OpenXmlPackageException">
/// Rethrown when its text contains "Invalid Hyperlink", so that a caller can
/// repair the document and retry. Previously the catch-all below swallowed it.
/// </exception>
private static string ParseDOCX(byte[] fileBytes, string filename, bool headerOnly)
{
    try
    {
        using (MemoryStream memoryStream = new MemoryStream())
        {
            // The document is opened in editable mode, so work on an expandable in-memory copy.
            memoryStream.Write(fileBytes, 0, fileBytes.Length);
            using (WordprocessingDocument wDoc = WordprocessingDocument.Open(memoryStream, true))
            {
                // Prefer the dc:title core property; fall back to the supplied file name.
                var pageTitle = filename;
                var part = wDoc.CoreFilePropertiesPart;
                if (part != null)
                {
                    pageTitle = (string)part.GetXDocument()
                        .Descendants(DC.title)
                        .FirstOrDefault() ?? filename;
                }

                WmlToHtmlConverterSettings settings = new WmlToHtmlConverterSettings()
                {
                    AdditionalCss = "body { margin: 1cm auto; max-width: 20cm; padding: 0; }",
                    PageTitle = pageTitle,
                    FabricateCssClasses = true,
                    CssClassPrefix = "pt-",
                    RestrictToSupportedLanguages = false,
                    RestrictToSupportedNumberingFormats = false,
                    ImageHandler = imageInfo =>
                    {
                        // Expect a content type such as "image/png"; skip the image otherwise.
                        if (imageInfo.ContentType == null) return null;
                        string[] contentTypeParts = imageInfo.ContentType.Split('/');
                        if (contentTypeParts.Length < 2) return null;

                        // Invariant casing: culture-sensitive lowering can break the comparisons below.
                        string subtype = contentTypeParts[1].ToLowerInvariant();

                        ImageFormat imageFormat;
                        switch (subtype)
                        {
                            case "png": imageFormat = ImageFormat.Png; break;
                            case "gif": imageFormat = ImageFormat.Gif; break;
                            case "bmp": imageFormat = ImageFormat.Bmp; break;
                            case "jpeg": imageFormat = ImageFormat.Jpeg; break;
                            case "tiff": imageFormat = ImageFormat.Gif; break; // TIFF is re-encoded as GIF
                            case "x-wmf": imageFormat = ImageFormat.Wmf; break;
                            default: return null; // unsupported type: omit the image
                        }

                        string base64;
                        try
                        {
                            using (MemoryStream ms = new MemoryStream())
                            {
                                imageInfo.Bitmap.Save(ms, imageFormat);
                                base64 = System.Convert.ToBase64String(ms.ToArray());
                            }
                        }
                        catch (System.Runtime.InteropServices.ExternalException)
                        {
                            return null;
                        }

                        // The MIME type is taken from the bitmap's raw format, as before.
                        // If no decoder matches, omit this one image instead of throwing
                        // and failing the whole conversion.
                        ImageFormat format = imageInfo.Bitmap.RawFormat;
                        ImageCodecInfo codec = ImageCodecInfo.GetImageDecoders()
                            .FirstOrDefault(c => c.FormatID == format.Guid);
                        if (codec == null) return null;

                        string imageSource =
                            string.Format("data:{0};base64,{1}", codec.MimeType, base64);

                        return new XElement(Xhtml.img,
                            new XAttribute(NoNamespace.src, imageSource),
                            imageInfo.ImgStyleAttribute,
                            imageInfo.AltText != null ?
                                new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
                    }
                };

                // Put header into document body, and remove everything else
                if (headerOnly)
                {
                    MoveHeaderToDocumentBody(wDoc);
                }

                XElement htmlElement = WmlToHtmlConverter.ConvertToHtml(wDoc, settings);
                var html = new XDocument(new XDocumentType("html", null, null, null),
                    htmlElement);
                return html.ToString(SaveOptions.DisableFormatting);
            }
        }
    }
    catch (OpenXmlPackageException e) when (e.ToString().Contains("Invalid Hyperlink"))
    {
        // Let the caller see this one: it is repairable and the caller retries after fixing it.
        throw;
    }
    catch
    {
        return "The file is either open, please close it or contains corrupt data";
    }
}
/// <summary>
/// Replaces the content of the main document body with the content of the
/// document's first header part, keeping the section properties.
/// </summary>
/// <remarks>
/// Only the first header is handled. Header info:
/// https://learn.microsoft.com/en-us/office/open-xml/how-to-replace-the-header-in-a-word-processing-document
/// If the document has no header part the body is simply emptied; previously
/// that case dereferenced null.
/// </remarks>
/// <param name="wDoc">The open document whose in-memory body XML is rewritten.</param>
private static void MoveHeaderToDocumentBody(WordprocessingDocument wDoc)
{
    MainDocumentPart mainDocument = wDoc.MainDocumentPart;
    XElement docRoot = mainDocument.GetXDocument().Root;
    XElement body = docRoot.Descendants(W.body).First();

    HeaderPart header = mainDocument.HeaderParts.FirstOrDefault();

    // AddXElementToBody only reads the children of its source element, so an
    // empty placeholder element stands in for a missing header.
    XElement headerRoot = header != null
        ? header.GetXDocument().Root
        : new XElement("empty-header");

    // The document body holds the header content once this call returns.
    AddXElementToBody(headerRoot, body);
}
/// <summary>
/// Empties <paramref name="body"/> and refills it with copies of the child
/// elements of <paramref name="sourceElement"/>, followed by copies of the
/// section-property elements the body contained beforehand. If the new content
/// holds an AlternateContent element, the children of its Choice element (or,
/// failing that, its Fallback element) are appended to the body as well.
/// </summary>
/// <param name="sourceElement">Element whose child elements become the new body content.</param>
/// <param name="body">The body element to rewrite in place.</param>
private static void AddXElementToBody(XElement sourceElement, XElement body)
{
    // Take copies up front: everything under the body is about to be removed.
    List<XElement> contentCopies = sourceElement.Elements()
        .Select(original => new XElement(original))
        .ToList();
    List<XElement> sectionCopies = body.Descendants(W.sectPr)
        .Select(original => new XElement(original))
        .ToList();

    body.Descendants().Remove();

    // New content first, then the preserved section properties.
    foreach (XElement node in contentCopies.Concat(sectionCopies))
    {
        body.Add(node);
    }

    // Only the first AlternateContent element is considered.
    XElement alternate = body.Descendants(MC.AlternateContent).FirstOrDefault();
    if (alternate == null)
    {
        return;
    }

    // Choice wins over Fallback when both are present.
    XElement textSource = alternate.Descendants(MC.Choice).FirstOrDefault()
        ?? alternate.Descendants(MC.Fallback).FirstOrDefault();
    if (textSource == null)
    {
        return;
    }

    foreach (XElement alternateChild in textSource.Elements())
    {
        body.Add(alternateChild);
    }
}
你可以添加类似的方法来处理Word文档页脚。
在我的情况下,我会将HTML文件转换为图像(使用基于wkHtmlToX的Net-Core-Html-To-Image)。我使用Magick.NET-Q16-AnyCpu将页眉和正文图像组合在一起,将页眉图像放置在正文图像的顶部。
/// <summary>
/// Reads the file at <paramref name="baseFilePathLocation"/> into memory and
/// returns it as a readable stream positioned at the start.
/// </summary>
/// <remarks>
/// The earlier version declared the stream with <c>using var</c>, so it was
/// already disposed when the caller received it, and left the position at the
/// end of the data. The caller now owns the returned stream.
/// </remarks>
/// <param name="baseFilePathLocation">Path of the document to load.</param>
/// <returns>A <see cref="MemoryStream"/> holding the file's bytes.</returns>
public static async Task<Stream> GetByteArrayOfDocumentAsync(string baseFilePathLocation)
{
    byte[] byteArray = await File.ReadAllBytesAsync(baseFilePathLocation).ConfigureAwait(false);
    return new MemoryStream(byteArray);
}
然后使用已设置我们的图形API令牌的客户端将此流上传到图形API
/// <summary>
/// Uploads the stream's bytes to a drive item through Microsoft Graph and
/// returns the id of the created document.
/// </summary>
/// <param name="client">An <see cref="HttpClient"/> already configured with the Graph API token.</param>
/// <param name="siteId">Site identifier used in the request URL.</param>
/// <param name="stream">The document content; read with <c>ToArray()</c>.</param>
/// <param name="driveId">Drive identifier used in the request URL.</param>
/// <param name="fileName">Name of the file to create under the folder.</param>
/// <param name="folderName">Parent item; defaults to "root".</param>
/// <returns>The <c>id</c> field of the deserialised response.</returns>
/// <exception cref="HttpRequestException">The service answered with a non-success status code.</exception>
/// <exception cref="InvalidOperationException">The response body deserialised to null.</exception>
public static async Task<string> UploadFileAsync(HttpClient client,
    string siteId,
    MemoryStream stream,
    string driveId,
    string fileName,
    string folderName = "root")
{
    using var content = new ByteArrayContent(stream.ToArray());
    using var result = await client.PutAsync(
        $"https://graph.microsoft.com/v1.0/sites/{siteId}/drives/{driveId}/items/{folderName}:/{fileName}:/content",
        content);

    // Fail here on an error status instead of trying to parse an error payload
    // as a document.
    result.EnsureSuccessStatusCode();

    var res = JsonSerializer.Deserialize<SharepointDocument>(await result.Content.ReadAsStringAsync());
    if (res is null)
    {
        throw new InvalidOperationException("The upload response did not contain a document.");
    }

    return res.id;
}
/// <summary>
/// Requests the PDF rendition of a drive item from Microsoft Graph
/// (<c>content?format=pdf</c>) and returns the response body as a stream.
/// </summary>
/// <param name="client">An <see cref="HttpClient"/> already configured with the Graph API token.</param>
/// <param name="siteId">Site identifier used in the request URL.</param>
/// <param name="driveId">Drive identifier used in the request URL.</param>
/// <param name="documentId">Identifier of the uploaded document.</param>
/// <returns>The response content stream; the caller is responsible for disposing it.</returns>
/// <exception cref="HttpRequestException">The service answered with a non-success status code.</exception>
public static async Task<Stream> GetPdfOfDocumentAsync(HttpClient client,
    string siteId,
    string driveId,
    string documentId)
{
    var getRequest =
        await client.GetAsync(
            $"https://graph.microsoft.com/v1.0/sites/{siteId}/drives/{driveId}/items/{documentId}/content?format=pdf");

    // Without this check an error payload would be handed back as if it were the PDF.
    getRequest.EnsureSuccessStatusCode();

    // The response is deliberately not disposed: the returned stream belongs to it.
    return await getRequest.Content.ReadAsStreamAsync();
}
using System.Diagnostics;
namespace somenamespace;
/// <summary>
/// Thrown when the LibreOffice process finishes with a non-zero exit code.
/// </summary>
public class LibreOfficeFailedException : Exception
{
    /// <param name="exitCode">The exit code reported by the LibreOffice process.</param>
    public LibreOfficeFailedException(int exitCode) : base($"LibreOffice has failed with {exitCode}") { }
}

/// <summary>
/// Converts Word documents to PDF by running a LibreOffice executable headlessly.
/// </summary>
public static class WordToPdf
{
    private static string GetLibreOfficePath() => @"C:\LibreOfficePortable\App\libreoffice\program\swriter.exe";

    /// <summary>
    /// Converts <paramref name="file"/> to a PDF inside <paramref name="outputDirectory"/>.
    /// An existing PDF with the same name is deleted first.
    /// </summary>
    /// <param name="file">Path of an existing .doc or .docx file.</param>
    /// <param name="outputDirectory">Existing directory that receives the PDF.</param>
    /// <returns>The full path of the generated PDF.</returns>
    /// <exception cref="ArgumentException">Either argument is null or empty.</exception>
    /// <exception cref="FileNotFoundException">The input file, the LibreOffice executable, or the expected output is missing.</exception>
    /// <exception cref="DirectoryNotFoundException">The output directory does not exist.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The file extension is neither .doc nor .docx.</exception>
    /// <exception cref="LibreOfficeFailedException">LibreOffice exited with a non-zero code.</exception>
    public static string ConvertWordFile(string file, string outputDirectory)
    {
        if (string.IsNullOrEmpty(file) || string.IsNullOrEmpty(outputDirectory))
        {
            throw new ArgumentException("Invalid parameters passed to convert word function.");
        }

        if (!File.Exists(file))
        {
            throw new FileNotFoundException($"The file passed to the convert word process ({file}) could not be found.");
        }

        if (!Directory.Exists(outputDirectory))
        {
            throw new DirectoryNotFoundException($"The output folder passed to the convert word process ({outputDirectory}) does not exist.");
        }

        // A trailing backslash would escape the closing quote of the --outdir argument.
        if (outputDirectory.EndsWith(@"\", StringComparison.Ordinal))
        {
            outputDirectory = outputDirectory[..^1];
        }

        var fileInfo = new FileInfo(file);

        // The old test required the extension to equal ".doc" AND ".docx" at once,
        // which is never true, so no file type was ever rejected.
        bool isWordDocument =
            fileInfo.Extension.Equals(".doc", StringComparison.OrdinalIgnoreCase) ||
            fileInfo.Extension.Equals(".docx", StringComparison.OrdinalIgnoreCase);
        if (!isWordDocument)
        {
            throw new ArgumentOutOfRangeException(
                nameof(file),
                $"The file type passed to the convert word process is an invalid type ({fileInfo.Extension}).");
        }

        var outputFile = outputDirectory + @"\" + Path.GetFileNameWithoutExtension(fileInfo.Name) + ".pdf";
        if (File.Exists(outputFile))
        {
            File.Delete(outputFile);
        }

        var libreOfficePath = GetLibreOfficePath();
        if (!File.Exists(libreOfficePath))
        {
            throw new FileNotFoundException("It seems that LibreOffice is not where it should be, please ensure the path exists.");
        }

        var procStartInfo = new ProcessStartInfo(libreOfficePath, $@"--headless --convert-to pdf:writer_pdf_Export ""{file}"" --outdir ""{outputDirectory}""")
        {
            RedirectStandardOutput = true,
            UseShellExecute = false,
            CreateNoWindow = true,
            WorkingDirectory = Environment.CurrentDirectory
        };

        using Process process = new() { StartInfo = procStartInfo };
        process.Start();
        // Drain redirected output before waiting: a child that fills the pipe
        // buffer would otherwise block forever.
        process.StandardOutput.ReadToEnd();
        process.WaitForExit();

        if (process.ExitCode != 0)
        {
            throw new LibreOfficeFailedException(process.ExitCode);
        }

        if (!File.Exists(outputFile))
        {
            throw new FileNotFoundException("The convert to word process has failed to convert the file!");
        }

        return outputFile;
    }
}
docx
据说是一种开放的格式(微软已经推广很长时间了),但它实际上相当糟糕——底层就是一个压缩包中的大量XML文件。doc
是二进制格式,但在过去20年中几乎没有改变,因此已经有很多解析器可供使用。Office一直是桌面应用程序,在服务器上使用成本高昂,我不可能是第一个/唯一一个提出这个要求的人。 - Keith
soffice --convert-to pdf --nologo name.docx
,然后你就会得到一个PDF文件。 - Shmuel H.