.Net iTextSharp获取关键字坐标并替换相关区域文本

最近遇到修改 PDF 报告结论的需求:把上图中的结论部分替换成新的文本。我们可以根据关键字"结论"和"报告医生"(或"报告者")定位这块区域的位置,同时还要支持文本中的转义字符(例如 \n 换行、\t 制表符)。
全部代码
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Web.Core.Log;
using Web.Core.Orm;

namespace Web.Core.Helper
{
    /// <summary>
    /// Replaces the "conclusion" region of a PDF report page with new text.
    /// The region is located from keyword positions found on the page.
    /// </summary>
    public class PdfHelper
    {
        // Font size (pt) used for the replacement text.
        private const float FontSize = 9f;

        // Vertical distance between two consecutive baselines.
        private const float LineHeight = 12f;

        // Distance between tab stops, in page units.
        private const float TabWidth = 40f;

        // Offset subtracted from the title's Left and Top when building the area.
        private const float TitleOffset = 3f;

        // Fixed X coordinate used as the right edge of the conclusion area.
        private const float AreaRight = 568f;

        // Offset added to the reporter chunk's Top to get the area's bottom edge.
        private const float ReporterOffset = 20f;

        /// <summary>
        /// Copies <paramref name="inputPdfPath"/> to <paramref name="outputPdfPath"/>, painting over
        /// the conclusion area of the given page and writing <paramref name="newConclusionText"/> into it.
        /// </summary>
        /// <param name="inputPdfPath">Path of the PDF to read.</param>
        /// <param name="outputPdfPath">Path of the PDF to create (overwritten if it exists).</param>
        /// <param name="newConclusionText">
        /// Replacement text. The two-character sequences \n, \r, \t and \\ are unescaped first.
        /// </param>
        /// <param name="pageNumber">1-based page to modify; defaults to the first page.</param>
        /// <exception cref="ArgumentException">A path is null, empty or whitespace.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="newConclusionText"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageNumber"/> is not a page of the document.</exception>
        /// <exception cref="InvalidOperationException">The conclusion area could not be located on the page.</exception>
        public static void ReplaceConclusion(string inputPdfPath, string outputPdfPath, string newConclusionText, int pageNumber = 1)
        {
            if (string.IsNullOrWhiteSpace(inputPdfPath))
            {
                throw new ArgumentException("Input PDF path must not be empty.", nameof(inputPdfPath));
            }
            if (string.IsNullOrWhiteSpace(outputPdfPath))
            {
                throw new ArgumentException("Output PDF path must not be empty.", nameof(outputPdfPath));
            }
            if (newConclusionText == null)
            {
                throw new ArgumentNullException(nameof(newConclusionText));
            }

            string processedText = ProcessEscapeCharacters(newConclusionText);

            using (PdfReader reader = new PdfReader(inputPdfPath))
            {
                if (pageNumber < 1 || pageNumber > reader.NumberOfPages)
                {
                    throw new ArgumentOutOfRangeException(nameof(pageNumber));
                }

                // Locate the area BEFORE creating the output file, so that a failed
                // lookup does not leave a freshly written PDF behind.
                PageInfo pageInfo = AnalyzePage(reader, pageNumber);
                if (pageInfo.ConclusionArea == null)
                {
                    throw new InvalidOperationException("未获取到报告结论");
                }

                using (FileStream fs = new FileStream(outputPdfPath, FileMode.Create))
                using (PdfStamper stamper = new PdfStamper(reader, fs))
                {
                    PdfContentByte canvas = stamper.GetOverContent(pageNumber);
                    TextArea area = pageInfo.ConclusionArea;

                    // Keep the colour changes below local to this drawing operation.
                    canvas.SaveState();

                    // Paint over the existing conclusion.
                    // NOTE(review): blue fill with white text is kept from the original code;
                    // confirm it matches the report's design.
                    canvas.SetColorFill(BaseColor.BLUE);
                    canvas.Rectangle(area.Left, area.Bottom, area.Width, area.Height);
                    canvas.Fill();

                    WriteTextInArea(canvas, area, processedText);

                    canvas.RestoreState();
                }
            }
        }

        /// <summary>
        /// Extracts the merged text chunks of a page, records the chunks containing the
        /// keywords "结论", "报告者" / "报告医生", and derives the conclusion area from them.
        /// </summary>
        /// <returns>
        /// A <see cref="PageInfo"/> whose <c>ConclusionArea</c> is null when either the title
        /// or the reporter chunk was not found.
        /// </returns>
        private static PageInfo AnalyzePage(PdfReader reader, int pageNumber)
        {
            var strategy = new TextLocationExtractionStrategy();
            PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);

            var pageInfo = new PageInfo();

            foreach (TextChunk textChunk in strategy.GetMergedTextChunks())
            {
                string text = textChunk.Text;
                if (text.Contains("结论"))
                {
                    pageInfo.ConclusionTitle = textChunk;
                }
                else if (text.Contains("报告者"))
                {
                    pageInfo.Reporter = textChunk;
                }
                else if (text.Contains("报告医生") && pageInfo.Reporter == null)
                {
                    // "报告医生" is only used when no reporter chunk has been recorded yet.
                    pageInfo.Reporter = textChunk;
                }
            }

            LogHelper.Info($"AnalyzePage pageInfo {JsonConvert.SerializeObject(pageInfo)}");

            if (pageInfo.ConclusionTitle != null && pageInfo.Reporter != null)
            {
                pageInfo.ConclusionArea = new TextArea
                {
                    Left = pageInfo.ConclusionTitle.Position.Left - TitleOffset,
                    // Position.Top holds the baseline Y, so this edge sits just below the title's baseline.
                    Top = pageInfo.ConclusionTitle.Position.Top - TitleOffset,
                    Right = AreaRight,
                    Bottom = pageInfo.Reporter.Position.Top + ReporterOffset
                };
            }

            return pageInfo;
        }

        /// <summary>
        /// Shows one line of text starting at (<paramref name="startX"/>, <paramref name="currentY"/>),
        /// advancing to the next multiple of <paramref name="tabWidth"/> at each tab character.
        /// Must be called between BeginText and EndText.
        /// </summary>
        private static void ProcessLineWithTabs(
            PdfContentByte canvas,
            string line,
            float startX,
            float currentY,
            float tabWidth,
            BaseFont font,
            float fontSize)
        {
            string[] segments = line.Split('\t');
            float x = startX;

            for (int i = 0; i < segments.Length; i++)
            {
                string segment = segments[i];
                if (segment.Length > 0)
                {
                    canvas.SetTextMatrix(x, currentY);
                    canvas.ShowText(segment);
                }

                if (i < segments.Length - 1)
                {
                    // Use the measured width so wide (e.g. CJK) glyphs do not overlap
                    // the text placed at the next tab stop.
                    float segmentEnd = x + GetTextWidth(font, fontSize, segment);
                    x = ((int)(segmentEnd / tabWidth) + 1) * tabWidth;
                }
            }
        }

        /// <summary>Returns the width of <paramref name="text"/> in points for the given font and size.</summary>
        private static float GetTextWidth(BaseFont font, float fontSize, string text)
        {
            return string.IsNullOrEmpty(text) ? 0f : font.GetWidthPoint(text, fontSize);
        }

        /// <summary>
        /// Writes <paramref name="text"/> line by line from the top of <paramref name="area"/>,
        /// stopping once the next baseline would fall below the area's bottom edge.
        /// Lines are not wrapped.
        /// </summary>
        private static void WriteTextInArea(PdfContentByte canvas, TextArea area, string text)
        {
            BaseFont baseFont = GetChineseFont();
            string[] lines = text.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

            canvas.BeginText();
            canvas.SetFontAndSize(baseFont, FontSize);
            canvas.SetColorFill(BaseColor.WHITE);

            float currentY = area.Top - LineHeight;
            foreach (string line in lines)
            {
                ProcessLineWithTabs(canvas, line, area.Left, currentY, TabWidth, baseFont, FontSize);
                currentY -= LineHeight;
                if (currentY < area.Bottom)
                {
                    break;
                }
            }

            canvas.EndText();
        }

        /// <summary>
        /// Converts the two-character escape sequences \n, \r, \t and \\ into the characters
        /// they denote. The text is scanned once, left to right, so an escaped backslash
        /// followed by 'n' yields a literal backslash and 'n' rather than a newline.
        /// Any other backslash is copied unchanged. Null input yields an empty string.
        /// </summary>
        private static string ProcessEscapeCharacters(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            var result = new StringBuilder(text.Length);
            for (int i = 0; i < text.Length; i++)
            {
                char current = text[i];
                if (current == '\\' && i + 1 < text.Length)
                {
                    switch (text[i + 1])
                    {
                        case 'n':
                            result.Append('\n');
                            i++;
                            continue;
                        case 'r':
                            result.Append('\r');
                            i++;
                            continue;
                        case 't':
                            result.Append('\t');
                            i++;
                            continue;
                        case '\\':
                            result.Append('\\');
                            i++;
                            continue;
                    }
                }

                result.Append(current);
            }

            return result.ToString();
        }

        /// <summary>
        /// Returns the first loadable font among msyh.ttc, simsun.ttc and simhei.ttf in the
        /// system fonts directory; falls back to Helvetica when none can be loaded.
        /// </summary>
        private static BaseFont GetChineseFont()
        {
            string fontsDirectory = Environment.GetFolderPath(Environment.SpecialFolder.Fonts);
            if (string.IsNullOrEmpty(fontsDirectory))
            {
                fontsDirectory = @"C:\Windows\Fonts";
            }

            string[] fontFiles = { "msyh.ttc", "simsun.ttc", "simhei.ttf" };
            foreach (string fontFile in fontFiles)
            {
                // Fully qualified: iTextSharp.text.pdf.parser may also declare a Path type.
                string fontPath = System.IO.Path.Combine(fontsDirectory, fontFile);
                if (!File.Exists(fontPath))
                {
                    continue;
                }

                // A TrueType collection needs the index of the face to load appended.
                string fontName = fontFile.EndsWith(".ttc", StringComparison.OrdinalIgnoreCase)
                    ? fontPath + ",0"
                    : fontPath;

                try
                {
                    return BaseFont.CreateFont(fontName, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
                }
                catch (Exception ex)
                {
                    // Best effort: record the failure and try the next candidate.
                    LogHelper.Info($"GetChineseFont failed to load {fontName}: {ex.Message}");
                }
            }

            return BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.EMBEDDED);
        }
    }

    /// <summary>Keyword chunks found on a page and the conclusion area derived from them.</summary>
    public class PageInfo
    {
        public TextChunk ConclusionTitle { get; set; }
        public TextChunk Reporter { get; set; }
        public TextChunk Diagnosis { get; set; }
        public TextArea ConclusionArea { get; set; }
    }

    /// <summary>A piece of page text together with its position.</summary>
    public class TextChunk
    {
        public string Text { get; set; }
        public TextPosition Position { get; set; }
    }

    /// <summary>
    /// Position of a text chunk in page coordinates. As filled in by
    /// <see cref="TextLocationExtractionStrategy"/>, <c>Top</c> is the Y of the text baseline
    /// and <c>Bottom</c> is the Y of the descent line.
    /// </summary>
    public class TextPosition
    {
        public float Left { get; set; }
        public float Top { get; set; }
        public float Right { get; set; }
        public float Bottom { get; set; }
    }

    /// <summary>Rectangular page area with derived width and height.</summary>
    public class TextArea
    {
        public float Left { get; set; }
        public float Top { get; set; }
        public float Right { get; set; }
        public float Bottom { get; set; }
        public float Width => Right - Left;
        public float Height => Top - Bottom;
    }

    /// <summary>
    /// Extraction strategy that records every rendered text run with its position and can
    /// merge horizontally adjacent runs on the same line.
    /// </summary>
    public class TextLocationExtractionStrategy : ITextExtractionStrategy
    {
        // Runs whose baseline Y rounds to the same multiple of this value are treated as one line.
        private const float Tolerance = 5f;

        // Runs on the same line closer than this many units are merged.
        private const float MergeGap = 10f;

        private readonly List<TextChunk> _textChunks = new List<TextChunk>();

        /// <summary>Records the text and position of one rendered text run.</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            var baseline = renderInfo.GetBaseline();
            Vector start = baseline.GetStartPoint();
            Vector end = baseline.GetEndPoint();
            Vector descent = renderInfo.GetDescentLine().GetStartPoint();

            _textChunks.Add(new TextChunk
            {
                Text = renderInfo.GetText(),
                Position = new TextPosition
                {
                    Left = start[Vector.I1],
                    Top = start[Vector.I2],
                    // End of the baseline instead of an estimate from the space width.
                    Right = end[Vector.I1],
                    Bottom = descent[Vector.I2]
                }
            });
        }

        /// <summary>
        /// Groups the recorded runs into lines, orders each line left to right and merges
        /// neighbouring runs. Lines are returned in the order their first run was rendered.
        /// The recorded runs are not modified, so repeated calls return equal results.
        /// </summary>
        public List<TextChunk> GetMergedTextChunks()
        {
            var mergedChunks = new List<TextChunk>();
            if (_textChunks.Count == 0)
            {
                return mergedChunks;
            }

            // Dictionary for lookup, list to keep a well-defined (first-seen) line order.
            var lineByKey = new Dictionary<int, List<TextChunk>>();
            var lines = new List<List<TextChunk>>();
            foreach (TextChunk chunk in _textChunks)
            {
                int yKey = (int)Math.Round(chunk.Position.Top / Tolerance);
                List<TextChunk> line;
                if (!lineByKey.TryGetValue(yKey, out line))
                {
                    line = new List<TextChunk>();
                    lineByKey[yKey] = line;
                    lines.Add(line);
                }
                line.Add(chunk);
            }

            foreach (List<TextChunk> line in lines)
            {
                TextChunk current = null;

                // OrderBy is a stable sort, so runs with equal Left keep their render order.
                foreach (TextChunk chunk in line.OrderBy(c => c.Position.Left))
                {
                    if (current != null && chunk.Position.Left - current.Position.Right < MergeGap)
                    {
                        current.Text += chunk.Text;
                        current.Position.Right = Math.Max(current.Position.Right, chunk.Position.Right);
                        continue;
                    }

                    if (current != null)
                    {
                        mergedChunks.Add(current);
                    }
                    current = Copy(chunk);
                }

                if (current != null)
                {
                    mergedChunks.Add(current);
                }
            }

            return mergedChunks;
        }

        public string GetResultantText() => string.Empty;

        public void BeginTextBlock() { }

        public void EndTextBlock() { }

        public void RenderImage(ImageRenderInfo renderInfo) { }

        // Creates an independent copy so that merging never alters the recorded runs.
        private static TextChunk Copy(TextChunk source)
        {
            return new TextChunk
            {
                Text = source.Text,
                Position = new TextPosition
                {
                    Left = source.Position.Left,
                    Top = source.Position.Top,
                    Right = source.Position.Right,
                    Bottom = source.Position.Bottom
                }
            };
        }
    }
}
解析
AnalyzePage方法获取到关键字位置,ProcessLineWithTabs处理\t情况,WriteTextInArea写入新文本
最终效果


浙公网安备 33010602011771号