利用C#从PDF文档中批量提取图片和文本
1、创建VS项目,并添加对 iTextSharp 库的引用(下文用到的 PdfReader、ImageRenderInfo、PdfTextExtractor 等类均来自该库)
2、编写提取图片的方法,代码如下:
/// <summary>
/// Walks every page of a PDF document and passes each image XObject found in the
/// page's resource dictionary to <c>RenderImage</c>.
/// </summary>
/// <param name="pdfFile">Path of the PDF document to read.</param>
private void ExtractImage(string pdfFile)
{
    // A single reader serves the whole document; it is closed in the finally block.
    PdfReader pdfReader = new PdfReader(pdfFile);
    try
    {
        for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
        {
            PdfDictionary pg = pdfReader.GetPageN(pageNumber);

            // Pages without a /Resources or /XObject dictionary carry no images.
            PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (res == null)
            {
                continue;
            }

            PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobj == null)
            {
                continue;
            }

            foreach (PdfName name in xobj.Keys)
            {
                PdfObject obj = xobj.Get(name);
                if (obj == null || !obj.IsIndirect())
                {
                    continue;
                }

                // Only XObjects whose /Subtype is /Image are images; skip everything else.
                PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                if (tg == null || !PdfName.IMAGE.Equals(tg.GetAsName(PdfName.SUBTYPE)))
                {
                    continue;
                }

                try
                {
                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new GraphicsState(), (PRIndirectReference)obj, tg);
                    RenderImage(imgRI);
                }
                catch (System.Exception ex)
                {
                    // Best effort: a single image that cannot be decoded or saved must not
                    // abort the remaining images on this page or the remaining pages.
                    System.Diagnostics.Trace.TraceWarning(
                        "Skipping XObject " + name + " on page " + pageNumber + ": " + ex.Message);
                }
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }
}
3、将图片保存到文件
/// <summary>
/// Converts the image described by <paramref name="renderInfo"/> to a
/// <c>System.Drawing.Image</c> and writes it to disk as a TIFF file.
/// </summary>
/// <param name="renderInfo">Render information for one image XObject.</param>
private void RenderImage(ImageRenderInfo renderInfo)
{
    // The counter is advanced for every call, including calls that end up saving nothing.
    count++;

    PdfImageObject image = renderInfo.GetImage();
    if (image == null)
    {
        return;
    }

    // Fully qualified to avoid any clash with another type named Image that may be in scope.
    using (System.Drawing.Image dotnetImg = image.GetDrawingImage())
    {
        if (dotnetImg == null)
        {
            return;
        }

        // The counter makes each file name distinct. The path is relative, so the file is
        // created in the process's current working directory; adjust as required.
        string outputPath = "image_" + count + ".tif";
        dotnetImg.Save(outputPath, ImageFormat.Tiff);
    }
}
4、从PDF提取文本
/// <summary>
/// Extracts the text of every page of a PDF document, one page at a time.
/// </summary>
/// <param name="pdfFile">Path of the PDF document to read.</param>
public void ExtractTextFromPDFPage(string pdfFile)
{
    PdfReader reader = new PdfReader(pdfFile);
    try
    {
        int n = reader.NumberOfPages;
        for (int i = 1; i <= n; i++)
        {
            // NOTE(review): the extracted text is not stored or returned anywhere;
            // consume it here (write it to a file, append it to a buffer, ...) as needed.
            string text = PdfTextExtractor.GetTextFromPage(reader, i);
        }
    }
    finally
    {
        // Close the reader even when extraction throws. A failure while closing is
        // deliberately ignored so it cannot mask an exception from the extraction itself.
        try { reader.Close(); }
        catch { }
    }
}