.Net iTextSharp获取关键字坐标并替换相关区域文本

image

 

最近遇到修改PDF报告结论的需求,也就是把上图中的结论部分替换成新的文本。我们可以根据关键字"结论"和"报告医生"(或"报告者")获取这块区域的位置,同时还要支持新文本中的转义字符(例如用 \n 换行、用 \t 表示制表符)。

全部代码

using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Web.Core.Log;
using Web.Core.Orm;

namespace Web.Core.Helper
{

    /// <summary>
    /// Replaces the "conclusion" section of a report PDF using iTextSharp.
    /// The section is located by keyword ("结论" above, "报告者"/"报告医生" below),
    /// covered with a filled rectangle, and overwritten with new text.
    /// </summary>
    public class PdfHelper
    {
        /// <summary>Font size, in points, of the replacement text.</summary>
        private const float ConclusionFontSize = 9f;

        /// <summary>Vertical distance between two consecutive text baselines.</summary>
        private const float LineHeight = 12f;

        /// <summary>Spacing of the tab stops used when a line contains '\t'.</summary>
        private const float TabWidth = 40f;

        /// <summary>X coordinate used as the right edge of the conclusion area.</summary>
        private const float RightMargin = 568f;

        /// <summary>Offset applied to the title position for the area's left and top edges.</summary>
        private const float TitleOffset = 3f;

        /// <summary>Gap kept between the area's bottom edge and the reporter line.</summary>
        private const float ReporterClearance = 20f;

        /// <summary>
        /// Replaces the conclusion section on the first page of the PDF.
        /// </summary>
        /// <param name="inputPdfPath">Path of the PDF to read.</param>
        /// <param name="outputPdfPath">Path of the PDF to create (overwritten if it exists).</param>
        /// <param name="newConclusionText">
        /// Replacement text. The two-character sequences \n, \r, \t and \\ are
        /// converted to newline, carriage return, tab and backslash.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="newConclusionText"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The conclusion area could not be located.</exception>
        public static void ReplaceConclusion(string inputPdfPath, string outputPdfPath, string newConclusionText)
        {
            ReplaceConclusion(inputPdfPath, outputPdfPath, newConclusionText, 1);
        }

        /// <summary>
        /// Replaces the conclusion section on the given page of the PDF.
        /// </summary>
        /// <param name="inputPdfPath">Path of the PDF to read.</param>
        /// <param name="outputPdfPath">Path of the PDF to create (overwritten if it exists).</param>
        /// <param name="newConclusionText">Replacement text; see the three-argument overload.</param>
        /// <param name="pageNumber">1-based number of the page that holds the conclusion.</param>
        /// <exception cref="ArgumentNullException"><paramref name="newConclusionText"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageNumber"/> is not a page of the document.</exception>
        /// <exception cref="InvalidOperationException">The conclusion area could not be located.</exception>
        public static void ReplaceConclusion(string inputPdfPath, string outputPdfPath, string newConclusionText, int pageNumber)
        {
            if (newConclusionText == null)
            {
                throw new ArgumentNullException(nameof(newConclusionText));
            }

            string processedText = ProcessEscapeCharacters(newConclusionText);

            using (PdfReader reader = new PdfReader(inputPdfPath))
            {
                if (pageNumber < 1 || pageNumber > reader.NumberOfPages)
                {
                    throw new ArgumentOutOfRangeException(
                        nameof(pageNumber),
                        pageNumber,
                        "Page number must be between 1 and the number of pages in the document.");
                }

                // Locate the area BEFORE the output file is created, so that a
                // failed lookup does not leave an output file behind.
                PageInfo pageInfo = AnalyzePage(reader, pageNumber);
                TextArea area = pageInfo.ConclusionArea;
                if (area == null)
                {
                    throw new InvalidOperationException("未获取到报告结论");
                }

                using (FileStream fs = new FileStream(outputPdfPath, FileMode.Create))
                using (PdfStamper stamper = new PdfStamper(reader, fs))
                {
                    PdfContentByte canvas = stamper.GetOverContent(pageNumber);

                    // Cover the old conclusion.
                    // NOTE(review): the cover is blue and the text is white, as in the
                    // original code; switch to a white cover with dark text if the
                    // replacement is supposed to blend into the page - confirm.
                    canvas.SetColorFill(BaseColor.BLUE);
                    canvas.Rectangle(area.Left, area.Bottom, area.Width, area.Height);
                    canvas.Fill();

                    WriteTextInArea(canvas, area, processedText);
                }
            }
        }

        /// <summary>
        /// Extracts the text chunks of a page and derives the conclusion area from
        /// the keyword chunks. <see cref="PageInfo.ConclusionArea"/> stays null when
        /// either the title or the reporter chunk is missing.
        /// </summary>
        private static PageInfo AnalyzePage(PdfReader reader, int pageNumber)
        {
            var strategy = new TextLocationExtractionStrategy();
            PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);

            var pageInfo = new PageInfo();

            foreach (var textChunk in strategy.GetMergedTextChunks())
            {
                string text = textChunk.Text;
                if (string.IsNullOrEmpty(text))
                {
                    continue;
                }

                if (text.Contains("结论"))
                {
                    // The last chunk containing the keyword wins.
                    pageInfo.ConclusionTitle = textChunk;
                }
                else if (text.Contains("报告者"))
                {
                    pageInfo.Reporter = textChunk;
                }
                else if (text.Contains("报告医生") && pageInfo.Reporter == null)
                {
                    // "报告医生" is only a fallback when no "报告者" chunk was seen yet.
                    pageInfo.Reporter = textChunk;
                }
            }

            LogHelper.Info($"AnalyzePage pageInfo {JsonConvert.SerializeObject(pageInfo)}");

            if (pageInfo.ConclusionTitle != null && pageInfo.Reporter != null)
            {
                pageInfo.ConclusionArea = new TextArea
                {
                    Left = pageInfo.ConclusionTitle.Position.Left - TitleOffset,
                    Top = pageInfo.ConclusionTitle.Position.Top - TitleOffset,     // just below the title
                    Right = RightMargin,
                    Bottom = pageInfo.Reporter.Position.Top + ReporterClearance    // above the reporter line
                };
            }

            return pageInfo;
        }

        /// <summary>
        /// Writes one line of text, moving to the next tab stop after every '\t'.
        /// Must be called between BeginText and EndText, with <paramref name="font"/>
        /// and <paramref name="fontSize"/> matching the font set on the canvas.
        /// </summary>
        private static void ProcessLineWithTabs(
            PdfContentByte canvas,
            string line,
            float startX,
            float currentY,
            float tabWidth,
            BaseFont font,
            float fontSize)
        {
            string[] segments = line.Split('\t');
            float currentX = startX;

            for (int i = 0; i < segments.Length; i++)
            {
                canvas.SetTextMatrix(currentX, currentY);
                canvas.ShowText(segments[i]);

                if (i < segments.Length - 1)
                {
                    // Measure the segment with the real font metrics; a fixed
                    // per-character estimate is too small for CJK glyphs and lets
                    // the next segment overlap this one.
                    float segmentEnd = currentX + font.GetWidthPoint(segments[i], fontSize);

                    // Tab stops are multiples of tabWidth measured from the page origin.
                    currentX = ((int)(segmentEnd / tabWidth) + 1) * tabWidth;
                }
            }
        }

        /// <summary>
        /// Writes <paramref name="text"/> line by line, starting one line height below
        /// the top of <paramref name="area"/> and stopping once the next baseline would
        /// fall below the area's bottom edge. Lines are not wrapped.
        /// </summary>
        private static void WriteTextInArea(PdfContentByte canvas, TextArea area, string text)
        {
            BaseFont baseFont = GetChineseFont();
            string[] lines = text.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

            canvas.BeginText();
            try
            {
                canvas.SetFontAndSize(baseFont, ConclusionFontSize);
                canvas.SetColorFill(BaseColor.WHITE);

                float currentY = area.Top - LineHeight;

                foreach (var line in lines)
                {
                    ProcessLineWithTabs(canvas, line, area.Left, currentY, TabWidth, baseFont, ConclusionFontSize);
                    currentY -= LineHeight;

                    if (currentY < area.Bottom)
                    {
                        break;
                    }
                }
            }
            finally
            {
                // Keep the begin/end text operators balanced even if writing fails.
                canvas.EndText();
            }
        }

        /// <summary>
        /// Converts the escape sequences \n, \r, \t and \\ in a single left-to-right
        /// pass. A single pass is required so that an escaped backslash followed by
        /// 'n' stays a literal backslash + 'n' instead of turning into a newline.
        /// Unknown sequences and a trailing backslash are kept unchanged.
        /// </summary>
        private static string ProcessEscapeCharacters(string text)
        {
            var result = new StringBuilder(text.Length);

            for (int i = 0; i < text.Length; i++)
            {
                char current = text[i];

                if (current == '\\' && i + 1 < text.Length)
                {
                    char? replacement = null;
                    switch (text[i + 1])
                    {
                        case 'n': replacement = '\n'; break;
                        case 'r': replacement = '\r'; break;
                        case 't': replacement = '\t'; break;
                        case '\\': replacement = '\\'; break;
                    }

                    if (replacement.HasValue)
                    {
                        result.Append(replacement.Value);
                        i++; // skip the escaped character
                        continue;
                    }
                }

                result.Append(current);
            }

            return result.ToString();
        }

        /// <summary>
        /// Returns the first loadable font from a list of common Windows Chinese fonts.
        /// Falls back to Helvetica, which has no Chinese glyphs, when none can be loaded.
        /// </summary>
        private static BaseFont GetChineseFont()
        {
            // The ",0" suffix selects a face inside a TrueType collection (.ttc).
            string[] fontPaths =
            {
                @"C:\Windows\Fonts\msyh.ttc,0",    // Microsoft YaHei
                @"C:\Windows\Fonts\simsun.ttc,0",  // SimSun
                @"C:\Windows\Fonts\simhei.ttf",    // SimHei
            };

            foreach (var fontPath in fontPaths)
            {
                int suffixStart = fontPath.LastIndexOf(',');
                string fontFile = suffixStart > 0 ? fontPath.Substring(0, suffixStart) : fontPath;

                if (!File.Exists(fontFile))
                {
                    continue;
                }

                try
                {
                    return BaseFont.CreateFont(fontPath, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
                }
                catch (Exception ex)
                {
                    // Best effort: record the failure and try the next candidate.
                    LogHelper.Info($"GetChineseFont failed to load {fontPath}: {ex.Message}");
                }
            }

            LogHelper.Info("GetChineseFont found no usable Chinese font, falling back to Helvetica");
            return BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.EMBEDDED);
        }
    }

    // Supporting types

    /// <summary>
    /// Result of analysing one PDF page: the keyword chunks that were found and
    /// the conclusion area derived from them.
    /// </summary>
    public class PageInfo
    {
        /// <summary>Chunk whose text contains "结论"; null when none was found.</summary>
        public TextChunk ConclusionTitle { get; set; }
        /// <summary>Chunk whose text contains "报告者" or "报告医生"; null when none was found.</summary>
        public TextChunk Reporter { get; set; }
        /// <summary>Not assigned by any code visible in this file.</summary>
        public TextChunk Diagnosis { get; set; }
        /// <summary>Area to overwrite; null unless both the title and the reporter chunk were found.</summary>
        public TextArea ConclusionArea { get; set; }
    }

    /// <summary>A piece of extracted text together with its position on the page.</summary>
    public class TextChunk
    {
        /// <summary>The extracted text.</summary>
        public string Text { get; set; }
        /// <summary>Where the text sits on the page.</summary>
        public TextPosition Position { get; set; }
    }

    /// <summary>Edges of a text chunk, in page coordinates.</summary>
    public class TextPosition
    {
        /// <summary>X coordinate of the left edge.</summary>
        public float Left { get; set; }
        /// <summary>
        /// Y coordinate of the top edge. As filled in by
        /// TextLocationExtractionStrategy.RenderText this is the baseline Y,
        /// not the ascent line.
        /// </summary>
        public float Top { get; set; }
        /// <summary>X coordinate of the right edge.</summary>
        public float Right { get; set; }
        /// <summary>Y coordinate of the bottom edge.</summary>
        public float Bottom { get; set; }
    }
    /// <summary>
    /// Rectangular page region described by its four edges; width and height
    /// are derived from the edges on every read.
    /// </summary>
    public class TextArea
    {
        /// <summary>X coordinate of the left edge.</summary>
        public float Left { get; set; }

        /// <summary>Y coordinate of the top edge.</summary>
        public float Top { get; set; }

        /// <summary>X coordinate of the right edge.</summary>
        public float Right { get; set; }

        /// <summary>Y coordinate of the bottom edge.</summary>
        public float Bottom { get; set; }

        /// <summary>Horizontal extent: right edge minus left edge.</summary>
        public float Width
        {
            get { return Right - Left; }
        }

        /// <summary>Vertical extent: top edge minus bottom edge.</summary>
        public float Height
        {
            get { return Top - Bottom; }
        }
    }
    /// <summary>
    /// Text extraction strategy that records every rendered text fragment with its
    /// position, and can merge fragments lying next to each other on the same line.
    /// <see cref="GetResultantText"/> always returns an empty string; use
    /// <see cref="GetMergedTextChunks"/> to read the result.
    /// </summary>
    public class TextLocationExtractionStrategy : ITextExtractionStrategy
    {
        private readonly List<TextChunk> _textChunks = new List<TextChunk>();

        // Fragments whose baseline Y rounds to the same multiple of this value are
        // treated as one line.
        private const float TOLERANCE = 5f;

        // Fragments on one line are merged when the horizontal gap is below this value.
        private const float MaxMergeGap = 10f;

        /// <summary>Records one rendered text fragment and its position.</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            var baseline = renderInfo.GetBaseline();
            Vector baselineStart = baseline.GetStartPoint();
            Vector baselineEnd = baseline.GetEndPoint();
            Vector descentStart = renderInfo.GetDescentLine().GetStartPoint();

            _textChunks.Add(new TextChunk
            {
                Text = renderInfo.GetText() ?? string.Empty,
                Position = new TextPosition
                {
                    Left = baselineStart[Vector.I1],
                    // Top deliberately stores the baseline Y (not the ascent).
                    Top = baselineStart[Vector.I2],
                    // Use the real end of the baseline. Estimating the width from
                    // the space width underestimates CJK text, which makes adjacent
                    // fragments look far apart and prevents them from being merged.
                    Right = baselineEnd[Vector.I1],
                    Bottom = descentStart[Vector.I2]
                }
            });
        }

        /// <summary>
        /// Groups the recorded fragments into lines and merges fragments of a line
        /// that are closer than 10 units. The recorded fragments themselves are not
        /// modified, so the method can be called repeatedly with the same result.
        /// </summary>
        /// <returns>The merged chunks; an empty list when nothing was recorded.</returns>
        public List<TextChunk> GetMergedTextChunks()
        {
            var mergedChunks = new List<TextChunk>();
            if (_textChunks.Count == 0)
            {
                return mergedChunks;
            }

            // Bucket fragments by rounded baseline Y. Fragments that sit very close
            // to each other but on opposite sides of a bucket boundary still end up
            // in different lines.
            var lines = new Dictionary<int, List<TextChunk>>();
            foreach (var chunk in _textChunks)
            {
                int yKey = (int)Math.Round(chunk.Position.Top / TOLERANCE);

                List<TextChunk> bucket;
                if (!lines.TryGetValue(yKey, out bucket))
                {
                    bucket = new List<TextChunk>();
                    lines[yKey] = bucket;
                }

                bucket.Add(chunk);
            }

            foreach (var line in lines.Values)
            {
                TextChunk current = null;

                // OrderBy is a stable sort, so fragments with equal Left keep
                // their content-stream order.
                foreach (var chunk in line.OrderBy(c => c.Position.Left))
                {
                    bool closeEnough = current != null
                        && chunk.Position.Left - current.Position.Right < MaxMergeGap;

                    if (closeEnough)
                    {
                        current.Text += chunk.Text;
                        // Never let the right edge shrink when fragments overlap.
                        current.Position.Right = Math.Max(current.Position.Right, chunk.Position.Right);
                        continue;
                    }

                    if (current != null)
                    {
                        mergedChunks.Add(current);
                    }

                    current = CopyOf(chunk);
                }

                if (current != null)
                {
                    mergedChunks.Add(current);
                }
            }

            return mergedChunks;
        }

        /// <summary>Creates an independent copy so merging never touches the recorded fragment.</summary>
        private static TextChunk CopyOf(TextChunk source)
        {
            return new TextChunk
            {
                Text = source.Text,
                Position = new TextPosition
                {
                    Left = source.Position.Left,
                    Top = source.Position.Top,
                    Right = source.Position.Right,
                    Bottom = source.Position.Bottom
                }
            };
        }

        /// <summary>Always returns an empty string; this strategy exposes chunks instead.</summary>
        public string GetResultantText() => string.Empty;

        /// <summary>No-op.</summary>
        public void BeginTextBlock() { }

        /// <summary>No-op.</summary>
        public void EndTextBlock() { }

        /// <summary>No-op; images are ignored.</summary>
        public void RenderImage(ImageRenderInfo renderInfo) { }
    }


}

 

解析

AnalyzePage 方法根据关键字获取结论区域的位置;WriteTextInArea 在该区域内逐行写入新文本;ProcessLineWithTabs 负责处理行内的制表符(\t)。

最终效果

b68145d9-0e81-4d27-ad02-42bf94d1bcce

 

posted @ 2026-01-20 16:20  hello-*-world  阅读(10)  评论(0)    收藏  举报