PDF数据提取------3.解析Demo

8/3/2015来源:C#应用人气:1768

PDF数据提取------3.解析Demo

1.PDF中文本字符串格式中关键值信息抓取(已完成)

简介:这种解析方式比较传统,也最简单,主要是熟练使用 Regular Expression(正则表达式)做语义识别和验证。例如抓取下图红色圈内的关键信息。

1.1

        string mettingData=GetMeetingData();

        /// <summary>
        /// Extracts the meeting date (and time, when present) from the PDF.
        /// A coarse regex first collects every text run that mentions a meeting
        /// followed by a year/month/day date; the candidates are then narrowed
        /// with a list of stricter date/time patterns.
        /// </summary>
        /// <returns>
        /// The matched date text of the first valid candidate, or five spaces
        /// when no candidate passes validation.
        /// </returns>
        public string GetMeetingData()
        {
            // Coarse pass: "会议" + up to 15 chars + yyyy年MM月dd日 + up to 15 trailing chars.
            string patternAll = @"(?<NDAandCAMDate>会\s*议\s*.{2,15}\d{2,4}\s*年\s*\d{1,2}\s*月\s*\d{1,2}\s*日.{0,15})";

            // Shared building blocks of the strict patterns.
            const string datePart = @"(?<year>\d{2,4})年(?<month>\d{1,2})月(?<day>\d{1,2})日";
            // Optional weekday in brackets. The original alternation was the redundant
            // "(\(|\()" / "(\)|\))", which looks like a lost full-width bracket; both
            // ASCII and full-width brackets are accepted here. Alternation (not a
            // character class) is kept so unnamed group numbering is unchanged.
            const string weekdayPart = @"((\(|()(星期|周)(一|二|三|四|五|六|七)(\)|)))?";
            const string prefix = datePart + weekdayPart;

            // Ordered from most to least specific: the first pattern that matches wins.
            List<string> patternFilter = new List<string>();
            patternFilter.Add(prefix + @"(上午)?(?<hour>\d{1,2})(\:|点|时)(?<minute>\d{1,2})");
            patternFilter.Add(prefix + @"下午(?<hour>\d{1,2})(\:|点|时)(?<minute>\d{1,2})");
            patternFilter.Add(prefix + @"(上午)?(?<hour>\d{1,2})点半");
            patternFilter.Add(prefix + @"下午(?<hour>\d{1,2})点半");
            patternFilter.Add(prefix + @"(上午)?(?<hour>\d{1,2})(点|时)");
            patternFilter.Add(prefix + @"下午(?<hour>\d{1,2})(点|时)");
            patternFilter.Add(datePart);

            PdfAnalyzer pa = new PdfAnalyzer();
            PDFNet.Initialize();

            // NOTE(review): `item` is not declared in this snippet; presumably a
            // field or outer variable holding the PDF path - confirm.
            using (PDFDoc doc = new PDFDoc(item))
            {
                doc.InitSecurityHandler();
                List<PdfString> foundAll = pa.RegexSearchAllPages(doc, patternAll);

                // Fix: the original passed `patternAll` (a single string) here, so the
                // strict patterns built above were never used.
                return GetMeetingDateFilter(foundAll, patternFilter);
            }
        }

        /// <summary>
        /// Returns the first piece of text in <paramref name="foundAll"/> that matches
        /// one of the patterns in <paramref name="patternAll"/> and passes IsValid.
        /// </summary>
        /// <param name="foundAll">Candidate strings found in the PDF.</param>
        /// <param name="patternAll">Regex patterns, tried in list order for each candidate.</param>
        /// <returns>The matched text, or five spaces when nothing valid is found.</returns>
        private string GetMeetingDateFilter(List<PdfString> foundAll, List<string> patternAll)
        {
            const string notFound = "     ";
            if (foundAll == null || patternAll == null)
            {
                return notFound;
            }

            // Build each Regex once instead of once per candidate string.
            List<Regex> filters = new List<Regex>(patternAll.Count);
            foreach (string pattern in patternAll)
            {
                filters.Add(new Regex(pattern));
            }

            foreach (PdfString pdfString in foundAll)
            {
                // Spaces are stripped so the patterns need no whitespace handling.
                string candidate = pdfString.ToString().Replace(" ", "");
                foreach (Regex filter in filters)
                {
                    Match ma = filter.Match(candidate);
                    if (ma.Success && IsValid(ma))
                    {
                        // Fix: the original returned the blank placeholder even on a
                        // valid match, so the extracted date was always lost.
                        return ma.Value;
                    }
                }
            }

            return notFound;
        }

注解:

a.第一次通过 pa.RegexSearchAllPages(doc, patternAll); 搜索所有与时间数据相关的信息

b.第二次通过正则匹配获取带有关键词信息Meeting Data

2.PDF类似表格形式关键值数据抓取。(已完成)

简介:这种格式需要用到封装的数据结构PdfString类和PdfAnalyzer类,根据给定关键词在指定范围内提取数据,例如提取下面的数据。

2.1

/// <summary>
/// Extracts the premium percentage for a stock from a PDF by position: it takes
/// the x coordinate of the first <paramref name="ricCode"/> match and the y
/// coordinate of the first "Premium"/"premium" match, then returns the first
/// percentage value whose position lies within 10 units of that point on both axes.
/// </summary>
/// <param name="path">Path of the PDF file to analyse.</param>
/// <param name="ricCode">
/// Stock code to locate. It is passed to the search as a regex pattern, unescaped.
/// NOTE(review): codes containing regex metacharacters such as '.' will match
/// loosely - confirm whether callers expect Regex.Escape here.
/// </param>
/// <returns>
/// The percentage text without the '%' sign, or an empty string when a value is
/// missing, no percentage is close enough, or an exception occurs (all cases are logged).
/// </returns>
private string GetPremium(string path, string ricCode)
{
    // Maximum distance (per axis) between the anchor point and a result value.
    const int tolerance = 10;
    string result = string.Empty;

    try
    {
        PDFNet.Initialize();

        // `using` releases the document on every exit path. The original never
        // closed it, and its `doc == null` check after `new` could never be true.
        using (PDFDoc doc = new PDFDoc(path))
        {
            doc.InitSecurityHandler();

            PdfAnalyzer pa = new PdfAnalyzer();
            List<PdfString> listX1 = pa.RegexSearchAllPages(doc, ricCode);
            // Fix: "[P|p]" also matched a literal '|'; "[Pp]" is what was meant.
            List<PdfString> listY1 = pa.RegexSearchAllPages(doc, @"[Pp]remium");
            List<PdfString> listResult = pa.RegexSearchAllPages(doc, @"(?<Result>\d+\.\d+\%)");

            if (listX1.Count == 0 || listY1.Count == 0 || listResult.Count == 0)
            {
                // Fix: the message previously said "Gearing" although this method extracts the premium.
                string msg = string.Format("({0}),([Pp]remium) exist missing value ,so Premium is empty value.", ricCode);
                Logger.Log(msg, Logger.LogType.Warning);
                return result;
            }

            // Anchor point: x from the stock code match, y from the "Premium" label.
            int x1 = System.Convert.ToInt32(listX1[0].Position.x1);
            int y1 = System.Convert.ToInt32(listY1[0].Position.y1);

            foreach (var item in listResult)
            {
                int distanceX = Math.Abs(x1 - System.Convert.ToInt32(item.Position.x1));
                int distanceY = Math.Abs(y1 - System.Convert.ToInt32(item.Position.y1));

                if (distanceX <= tolerance && distanceY <= tolerance)
                {
                    return item.ToString().Replace("%", "");
                }
            }

            Logger.Log(string.Format("stock code:{0},extract premium failed .", ricCode), Logger.LogType.Error);
            return result;
        }
    }
    catch (Exception ex)
    {
        // Fix: ricCode is now a format argument; concatenating it into the template
        // threw FormatException whenever it contained '{' or '}'.
        string msg = string.Format("PDF analysis failed for {0}! Action: Need manually input gearing and premium \r\n error msg:{1}", ricCode, ex.Message);
        Logger.Log(msg, Logger.LogType.Warning);
        return result;
    }
}

3.需要PDF中大量数据转换到Excel中去 (已完成)

简介:基于2的延伸,加入自动模糊匹配行和列的边界范围,再根据位置坐标排序,提取正确的数据信息。如图:

2.2 2.3

/// <summary>
/// Extracts tabular values from the PDF at config.FilePath1 and writes them to a
/// dated CSV file in config.OutputFolder, then registers the file via AddResult.
/// Returns without writing anything when the "コード" title cannot be located.
/// </summary>
private void StartExtractFile()
{
    const string patternTitle = @"コード";
    const string patternRic = @"\d{4}";
    const string patternValue = @"(\-|\+)?\d+(\,|\.|\d)+";
    const int page = 3;
    const int indexOK = 0;

    List<List<string>> bulkFileFilter = null;

    PDFNet.Initialize();

    // `using` releases the document on every exit path, including the early
    // return below; the original never closed it.
    using (PDFDoc doc = new PDFDoc(config.FilePath1))
    {
        doc.InitSecurityHandler();

        PdfString ricPosition = GetRicPosition(doc, patternTitle, page);
        if (ricPosition == null)
        {
            return;
        }

        List<LineFound> bulkFile = GetValue(doc, ricPosition, patternRic, patternValue);
        bulkFileFilter = FilterBulkFile(bulkFile, indexOK);
    }

    string filePath = Path.Combine(config.OutputFolder, string.Format("Type1ExtractedFromPdf{0}.csv", DateTime.Now.ToString("dd-MM-yyyy")));

    // Replace any file already produced for the same day.
    if (File.Exists(filePath))
    {
        File.Delete(filePath);
    }

    XlsOrCsvUtil.GenerateStringCsv(filePath, bulkFileFilter);
    AddResult(Path.GetFileNameWithoutExtension(filePath), filePath, "type1");
}

private List<List<string>> FilterBulkFile(List<LineFound> bulkFile, int indexOK)
{
    List<List<string>> result = new List<List<string>>();
    if (bulkFile == null || bulkFile.Count == 0)
    {
        Logger.Log("no value data extract from pdf");
        return