在C#中将PDF分割成多个文件

14
我们有一个C# Windows服务,目前通过读取PDF中的二维条形码(使用第三方组件)来处理所有PDF,然后更新数据库并将文档存储在文档存储库中。
是否可以在读取条形码后切割文件并将其存储为另一个文档?
例如,如果有一个10页的文档,它应该拆分成10个不同的文件。
谢谢。

你目前正在使用任何PDF库吗? - Marko
我的理解是第三方组件仅用于检测 PDF 中的条形码。 - gyurisc
7个回答

7

我遇到了同样的问题,你可以使用itextsharp组件工具来拆分文档。

/// <summary>
/// Splits a PDF into two files at a given page: pages 1..(pagenumber - 1) go to the
/// first destination file, pages pagenumber..last go to the second.
/// </summary>
/// <param name="args">
/// Exactly four values: source file, first destination file, second destination file,
/// and the 1-based page number at which the second file starts.
/// </param>
/// <remarks>
/// Errors are not propagated; they are reported on <see cref="Console.Error"/>.
/// </remarks>
public Split(String[] args)
{
    // A null array is treated like a wrong argument count instead of crashing outside the try block.
    if (args == null || args.Length != 4)
    {
        Console.Error.WriteLine("This tools needs 4 parameters:\njava Split srcfile destfile1 destfile2 pagenumber");
        return;
    }

    try
    {
        int pagenumber = int.Parse(args[3]);

        // we create a reader for a certain document
        PdfReader reader = new PdfReader(args[0]);
        try
        {
            // we retrieve the total number of pages
            int n = reader.NumberOfPages;
            Console.WriteLine("There are " + n + " pages in the original file.");

            // Both halves must contain at least one page.
            if (pagenumber < 2 || pagenumber > n)
            {
                throw new DocumentException("You can't split this document at page " + pagenumber + "; there is no such page.");
            }

            // The using blocks release both output file handles even when copying fails part-way.
            using (FileStream output1 = new FileStream(args[1], FileMode.Create))
            using (FileStream output2 = new FileStream(args[2], FileMode.Create))
            {
                // step 1: creation of a document-object
                Document document1 = new Document(reader.GetPageSizeWithRotation(1));
                Document document2 = new Document(reader.GetPageSizeWithRotation(pagenumber));
                // step 2: we create a writer that listens to the document
                PdfWriter writer1 = PdfWriter.GetInstance(document1, output1);
                PdfWriter writer2 = PdfWriter.GetInstance(document2, output2);
                // step 3: we open the document
                document1.Open();
                document2.Open();
                // step 4: we add content (only the second half reports progress, as before)
                CopyPages(reader, document1, writer1, 1, pagenumber - 1, false);
                CopyPages(reader, document2, writer2, pagenumber, n, true);
                // step 5: we close the document
                document1.Close();
                document2.Close();
            }
        }
        finally
        {
            // Release the source file whether or not the split succeeded.
            reader.Close();
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine(e.Message);
        Console.Error.WriteLine(e.StackTrace);
    }
}

/// <summary>
/// Copies pages firstPage..lastPage (1-based, inclusive) from the reader into an already opened document.
/// </summary>
private static void CopyPages(PdfReader reader, Document document, PdfWriter writer, int firstPage, int lastPage, bool logProgress)
{
    PdfContentByte content = writer.DirectContent;
    for (int pageIndex = firstPage; pageIndex <= lastPage; pageIndex++)
    {
        document.SetPageSize(reader.GetPageSizeWithRotation(pageIndex));
        document.NewPage();
        PdfImportedPage page = writer.GetImportedPage(reader, pageIndex);
        int rotation = reader.GetPageRotation(pageIndex);
        if (rotation == 90 || rotation == 270)
        {
            // Rotated pages are placed with a rotation matrix shifted up by the rotated page height.
            content.AddTemplate(page, 0, -1f, 1f, 0, 0, reader.GetPageSizeWithRotation(pageIndex).Height);
        }
        else
        {
            content.AddTemplate(page, 1f, 0, 0, 1f, 0, 0);
        }

        if (logProgress)
        {
            Console.WriteLine("Processed page " + pageIndex);
        }
    }
}

3
请注意商业使用许可证。 - Arash Aghlara
根据NuGet软件包的详细信息,它是在MIT许可证下开源的:“PDFsharp是开源.NET库,可以轻松地从任何.NET语言动态创建和处理PDF文档。” - MUlferts
1
@MUlferts - 您是正确的,这个软件包可以免费用于商业用途,因为“您可以在没有任何限制的情况下将PDFsharp的源代码集成到您的应用程序中。”但这并不是因为它是开源的。如果某物是开源的,那只意味着它的源代码是开放的。 - Michael

4

2
/// <summary>
/// Writes every page of a PDF to its own single-page file named "N.pdf"
/// (N is the 1-based page number) inside <paramref name="DestinationFolder"/>.
/// </summary>
/// <param name="sourcePdfPath">Path of the PDF file to split.</param>
/// <param name="DestinationFolder">Folder that receives the page files; it is created when missing.</param>
/// <returns>
/// The number of pages written. When a failure occurs before the first page is processed
/// (for example the source cannot be opened) the result is -1, as in the original implementation.
/// </returns>
/// <remarks>
/// Failures are not propagated to the caller; they are reported through
/// <see cref="System.Diagnostics.Trace"/> instead of being silently discarded.
/// </remarks>
public int ExtractPages(string sourcePdfPath, string DestinationFolder)
{
    int p = 0;
    iTextSharp.text.pdf.PdfReader reader = null;
    try
    {
        reader = new iTextSharp.text.pdf.PdfReader(new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdfPath), new ASCIIEncoding().GetBytes(""));

        // Make sure the folder we actually write to exists. The previous code created (or
        // recursively deleted) a folder derived from the source path, which is unrelated
        // to where the output files are written.
        Directory.CreateDirectory(DestinationFolder);

        for (p = 1; p <= reader.NumberOfPages; p++)
        {
            using (MemoryStream memoryStream = new MemoryStream())
            {
                // Use the rotated page size so landscape/rotated pages match the transform applied below.
                iTextSharp.text.Document document = new iTextSharp.text.Document(reader.GetPageSizeWithRotation(p));
                iTextSharp.text.pdf.PdfWriter writer = iTextSharp.text.pdf.PdfWriter.GetInstance(document, memoryStream);
                writer.SetPdfVersion(iTextSharp.text.pdf.PdfWriter.PDF_VERSION_1_2);
                writer.CompressionLevel = iTextSharp.text.pdf.PdfStream.BEST_COMPRESSION;
                writer.SetFullCompression();
                document.Open();
                // Kept from the original; presumably redundant with GetInstance - TODO confirm before removing.
                document.AddDocListener(writer);
                iTextSharp.text.pdf.PdfContentByte cb = writer.DirectContent;
                iTextSharp.text.pdf.PdfImportedPage pageImport = writer.GetImportedPage(reader, p);
                int rot = reader.GetPageRotation(p);
                if (rot == 90 || rot == 270)
                {
                    cb.AddTemplate(pageImport, 0, -1.0F, 1.0F, 0, 0, reader.GetPageSizeWithRotation(p).Height);
                }
                else
                {
                    cb.AddTemplate(pageImport, 1.0F, 0, 0, 1.0F, 0, 0);
                }
                document.Close();
                File.WriteAllBytes(Path.Combine(DestinationFolder, p + ".pdf"), memoryStream.ToArray());
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract is preserved (no rethrow), but the failure is now visible.
        System.Diagnostics.Trace.TraceError("ExtractPages failed for '" + sourcePdfPath + "' at page " + p + ": " + ex);
    }
    finally
    {
        // Release the source file even when a page failed; no forced garbage collection is needed.
        if (reader != null)
        {
            reader.Close();
        }
    }
    return p - 1;
}

在任何你想要的地方调用这个函数,并传递源PDF文件路径和目标文件夹路径


相当不错,但是在横向页面方面存在问题。 - Erfan

2

0
    /// <summary>
    /// Splits a PDF into one file per top-level bookmark. Each output file is named after the
    /// bookmark title and covers the pages from that bookmark up to (but not including) the
    /// page of the next bookmark; the last bookmark runs to the end of the document.
    /// </summary>
    /// <param name="fileName">Path of the PDF to split; it is also passed on as the input file name for each output.</param>
    /// <remarks>
    /// Does nothing when the document has no bookmarks. Exceptions propagate to the caller
    /// with their original stack trace.
    /// </remarks>
    public void SplitPDFByBookMark(string fileName)
    {
        var pdfReader = new PdfReader(fileName);
        try
        {
            IList<Dictionary<string, object>> bookmarks = SimpleBookmark.GetBookmark(pdfReader);
            if (bookmarks == null)
            {
                // No outline in this document: nothing to split on.
                return;
            }

            for (int i = 0; i < bookmarks.Count; ++i)
            {
                Dictionary<string, object> current = bookmarks[i];

                // The "Page" entry looks like "<number> <fit mode...>"; only the leading number is used.
                int startPage = int.Parse(current["Page"].ToString().Split(' ')[0]);
                int nextStartPage = i == bookmarks.Count - 1
                    ? pdfReader.NumberOfPages + 1
                    : int.Parse(bookmarks[i + 1]["Page"].ToString().Split(' ')[0]);

                // Read the title by key instead of relying on the dictionary's value ordering.
                string outputName = current["Title"].ToString() + ".pdf";
                SplitByBookmark(pdfReader, startPage, nextStartPage, outputName, fileName);
            }
        }
        finally
        {
            // Release the source file whether or not splitting succeeded.
            pdfReader.Close();
        }
    }

    /// <summary>
    /// Copies pages pageFrom..(PageTo - 1) of the reader into a new PDF written next to the input file.
    /// </summary>
    /// <param name="reader">Open reader for the source document.</param>
    /// <param name="pageFrom">1-based first page to copy.</param>
    /// <param name="PageTo">1-based page at which copying stops (exclusive).</param>
    /// <param name="outPutName">File name of the PDF to create.</param>
    /// <param name="inPutFileName">Path of the source file; its directory receives the output.</param>
    /// <remarks>
    /// When the resulting page range is empty no file is created.
    /// </remarks>
    private void SplitByBookmark(PdfReader reader, int pageFrom, int PageTo, string outPutName, string inPutFileName)
    {
        // Original special case kept: when both bounds are page 1, page 1 itself is exported.
        int endExclusive = (pageFrom == PageTo && pageFrom == 1) ? PageTo + 1 : PageTo;
        if (pageFrom >= endExclusive)
        {
            return;
        }

        // Path.Combine avoids the hard-coded '\\', which sent bare file names to the drive root.
        string outputPath = Path.Combine(Path.GetDirectoryName(inPutFileName) ?? string.Empty, outPutName);

        Document document = new Document();
        using (var fs = new FileStream(outputPath, System.IO.FileMode.Create))
        using (var writer = PdfWriter.GetInstance(document, fs))
        {
            document.Open();
            PdfContentByte cb = writer.DirectContent;
            for (int pageNumber = pageFrom; pageNumber < endExclusive; pageNumber++)
            {
                document.NewPage();
                PdfImportedPage page = writer.GetImportedPage(reader, pageNumber);
                cb.AddTemplate(page, 0, 0);
            }

            // Close once, after every page has been added; closing inside the loop
            // made every iteration after the first operate on a closed document.
            document.Close();
        }
    }

你可以从 NuGet 安装 itextsharp 并将这段代码复制粘贴到 C# 应用程序中。调用 SplitPDFByBookMark() 方法并传递 PDF 文件名即可。这段代码将帮助你查找书签并完成分割!

谢谢Milo。 - Ruben Suardi
请纠正我,但我认为fs.close和document.close需要放在while循环之外。 - Tim F.

0

这段代码基于PDFsharp库

http://www.pdfsharp.com/PDFsharp/

如果你想按书签拆分,这里是代码。

   /// <summary>
   /// Splits a PDF into one file per top-level bookmark. Each output file is named after the
   /// bookmark title and covers the pages from that bookmark up to (but not including) the
   /// page of the next bookmark; the last bookmark runs to the end of the document.
   /// </summary>
   /// <param name="fileName">Path of the PDF to split; it is also passed on as the input file name for each output.</param>
   /// <remarks>
   /// Does nothing when the document has no bookmarks. Exceptions propagate to the caller
   /// with their original stack trace.
   /// </remarks>
   public static void SplitPDFByBookMark(string fileName)
    {
        PdfReader pdfReader = new PdfReader(fileName);
        try
        {
            IList<Dictionary<string, object>> bookmarks = SimpleBookmark.GetBookmark(pdfReader);
            if (bookmarks == null)
            {
                // No outline in this document: nothing to split on.
                return;
            }

            for (int i = 0; i < bookmarks.Count; ++i)
            {
                // Use the i-th bookmark; indexing with 0 made every output start at the first bookmark's page.
                Dictionary<string, object> current = bookmarks[i];

                // The "Page" entry looks like "<number> <fit mode...>"; only the leading number is used.
                int startPage = int.Parse(current["Page"].ToString().Split(' ')[0]);
                int nextStartPage = i == bookmarks.Count - 1
                    ? pdfReader.NumberOfPages + 1
                    : int.Parse(bookmarks[i + 1]["Page"].ToString().Split(' ')[0]);

                // Read the title by key instead of relying on the dictionary's value ordering.
                string outputName = current["Title"].ToString() + ".pdf";
                SplitByBookmark(pdfReader, startPage, nextStartPage, outputName, fileName);
            }
        }
        finally
        {
            // Release the source file whether or not splitting succeeded.
            pdfReader.Close();
        }
    }
    /// <summary>
    /// Copies pages pageFrom..(PageTo - 1) of the reader into a new PDF written next to the input file.
    /// </summary>
    /// <param name="reader">Open reader for the source document.</param>
    /// <param name="pageFrom">1-based first page to copy.</param>
    /// <param name="PageTo">1-based page at which copying stops (exclusive).</param>
    /// <param name="outPutName">File name of the PDF to create.</param>
    /// <param name="inPutFileName">Path of the source file; its directory receives the output.</param>
    /// <remarks>
    /// When the resulting page range is empty no file is created.
    /// </remarks>
    private static void SplitByBookmark(PdfReader reader, int pageFrom, int PageTo, string outPutName, string inPutFileName)
    {
        // Original special case kept: when both bounds are page 1, page 1 itself is exported.
        int endExclusive = (pageFrom == PageTo && pageFrom == 1) ? PageTo + 1 : PageTo;
        if (pageFrom >= endExclusive)
        {
            return;
        }

        // Path.Combine avoids the hard-coded '\\', which sent bare file names to the drive root.
        string outputPath = System.IO.Path.Combine(System.IO.Path.GetDirectoryName(inPutFileName) ?? string.Empty, outPutName);

        Document document = new Document();
        // The using block releases the file handle on every path, including failures.
        using (FileStream fs = new System.IO.FileStream(outputPath, System.IO.FileMode.Create))
        {
            PdfWriter writer = PdfWriter.GetInstance(document, fs);
            document.Open();
            PdfContentByte cb = writer.DirectContent;
            for (int pageNumber = pageFrom; pageNumber < endExclusive; pageNumber++)
            {
                document.NewPage();
                PdfImportedPage page = writer.GetImportedPage(reader, pageNumber);
                cb.AddTemplate(page, 0, 0);
            }

            // Close once, after every page has been added; closing inside the loop
            // made every iteration after the first operate on a closed document.
            document.Close();
        }
    }

请提供有关此解决方案的其他信息。如果要按页面拆分PDF,OP需要如何更改此代码?这个代码使用了哪个库? - andr

-1

使用 Spire PDF,轻松实现:

using Spire.Pdf;

namespace split_pdf
{
    /// <summary>
    /// Sample program that splits an existing PDF into multiple files with Spire.PDF.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Opens ..\Sample3.pdf and splits it into files named after the
        /// "SplitDocument-{0}.pdf" pattern.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            //open an existing pdf document
            PdfDocument doc = new PdfDocument(@"..\Sample3.pdf");
            try
            {
                //Split one PDF document to multiple files
                // NOTE(review): {0} is presumably replaced by the page index - confirm in the Spire.PDF docs.
                string pattern = "SplitDocument-{0}.pdf";
                doc.Split(pattern);
            }
            finally
            {
                // Close the document even when splitting fails.
                doc.Close();
            }
        }
    }
}

这里是文档页面:SpirePdf


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接