CSharp: Tesseract OCR V5.0 in .NET Core 3.1
Reference resources
https://github.com/alex-doe/open-ocr-dotnet
https://github.com/tleyden/open-ocr/ go
https://github.com/DayBreak-u/chineseocr_lite
https://github.com/pjreddie/darknet
https://sourceforge.net/projects/vietocr/
https://github.com/PaddlePaddle/PaddleOCR
https://github.com/gumblex/tessdata_chi
https://github.com/nobody132/masr
https://code.google.com/p/tesseractdotnet
https://github.com/tesseract-ocr/tesseract
https://github.com/tesseract-ocr/tessdata
https://github.com/charlesw/tesseract
https://github.com/charlesw/tesseract-samples
https://github.com/tesseract-ocr/langdata

/// <summary>
/// Language codes for Tesseract trained-data files. Each member name is the base
/// name of the matching <c>.traineddata</c> file (for example <c>eng</c> for eng.traineddata).
/// Callers in this file pass <c>ToString()</c> of a member as the engine language,
/// so member names must not be changed.
/// NOTE(review): the type name is spelled "Languange"; it is kept because callers reference it.
/// Data file list: https://tesseract-ocr.github.io/tessdoc/Data-Files
/// Author: geovindu, Geovin Du
/// </summary>
public enum LanguangeList
{
/// <summary>
/// Afrikaans afr.traineddata
/// </summary>
afr,
/// <summary>
/// Amharic amh.traineddata
/// </summary>
amh,
/// <summary>
/// Arabic ara.traineddata
/// </summary>
ara,
/// <summary>
/// Assamese asm.traineddata
/// </summary>
asm,
/// <summary>
/// Azerbaijani aze.traineddata
/// </summary>
aze,
/// <summary>
/// Azerbaijani - Cyrillic aze_cyrl.traineddata
/// </summary>
aze_cyrl,
/// <summary>
/// Belarusian bel.traineddata
/// </summary>
bel,
/// <summary>
/// Bengali ben.traineddata
/// </summary>
ben,
/// <summary>
/// Tibetan bod.traineddata
/// </summary>
bod,
/// <summary>
/// Bosnian bos.traineddata
/// </summary>
bos,
/// <summary>
/// Bulgarian bul.traineddata
/// </summary>
bul,
/// <summary>
/// Catalan; Valencian cat.traineddata
/// </summary>
cat,
/// <summary>
/// Cebuano ceb.traineddata
/// </summary>
ceb,
/// <summary>
/// Czech ces.traineddata
/// </summary>
ces,
/// <summary>
/// Simplified Chinese
/// Chinese - Simplified chi_sim.traineddata
/// </summary>
chi_sim,
/// <summary>
/// Traditional Chinese
/// Chinese - Traditional chi_tra.traineddata
/// </summary>
chi_tra,
/// <summary>
/// Cherokee chr.traineddata
/// </summary>
chr,
/// <summary>
/// Welsh cym.traineddata
/// </summary>
cym,
/// <summary>
/// Danish dan.traineddata
/// </summary>
dan,
/// <summary>
/// German deu.traineddata
/// </summary>
deu,
/// <summary>
/// Dzongkha dzo.traineddata
/// </summary>
dzo,
/// <summary>
/// Greek, Modern (1453-) ell.traineddata
/// </summary>
ell,
/// <summary>
/// English eng.traineddata
/// </summary>
eng,
/// <summary>
/// English, Middle (1100-1500) enm.traineddata
/// </summary>
enm,
/// <summary>
/// Esperanto epo.traineddata
/// </summary>
epo,
/// <summary>
/// Estonian est.traineddata
/// </summary>
est,
/// <summary>
/// Basque eus.traineddata
/// </summary>
eus,
/// <summary>
/// Persian fas.traineddata
/// </summary>
fas,
/// <summary>
/// Finnish fin.traineddata
/// </summary>
fin,
/// <summary>
/// French fra.traineddata
/// </summary>
fra,
/// <summary>
/// German Fraktur frk.traineddata
/// </summary>
frk,
/// <summary>
/// French, Middle (ca. 1400-1600) frm.traineddata
/// </summary>
frm,
/// <summary>
/// Irish gle.traineddata
/// </summary>
gle,
/// <summary>
/// Galician glg.traineddata
/// </summary>
glg,
/// <summary>
/// Greek, Ancient (-1453) grc.traineddata
/// </summary>
grc,
/// <summary>
/// Gujarati guj.traineddata
/// </summary>
guj,
/// <summary>
/// Haitian; Haitian Creole hat.traineddata
/// </summary>
hat,
/// <summary>
/// Hebrew heb.traineddata
/// </summary>
heb,
/// <summary>
/// Hindi hin.traineddata
/// </summary>
hin,
/// <summary>
/// Croatian hrv.traineddata
/// </summary>
hrv,
/// <summary>
/// Hungarian hun.traineddata
/// </summary>
hun,
/// <summary>
/// Inuktitut iku.traineddata
/// </summary>
iku,
/// <summary>
/// Indonesian ind.traineddata
/// </summary>
ind,
/// <summary>
/// Icelandic isl.traineddata
/// </summary>
isl,
/// <summary>
/// Italian ita.traineddata
/// </summary>
ita,
/// <summary>
/// Italian - Old ita_old.traineddata
/// </summary>
ita_old,
/// <summary>
/// Javanese jav.traineddata
/// </summary>
jav,
/// <summary>
/// Japanese jpn.traineddata
/// </summary>
jpn,
/// <summary>
/// Kannada kan.traineddata
/// </summary>
kan,
/// <summary>
/// Georgian kat.traineddata
/// </summary>
kat,
/// <summary>
/// Georgian - Old kat_old.traineddata
/// </summary>
kat_old,
/// <summary>
/// Kazakh kaz.traineddata
/// </summary>
kaz,
/// <summary>
/// Central Khmer khm.traineddata
/// </summary>
khm,
/// <summary>
/// Kirghiz; Kyrgyz kir.traineddata
/// </summary>
kir,
/// <summary>
/// Korean kor.traineddata
/// </summary>
kor,
/// <summary>
/// Kurdish kur.traineddata
/// </summary>
kur,
/// <summary>
/// Lao lao.traineddata
/// </summary>
lao,
/// <summary>
/// Latin lat.traineddata
/// </summary>
lat,
/// <summary>
/// Latvian lav.traineddata
/// </summary>
lav,
/// <summary>
/// Lithuanian lit.traineddata
/// </summary>
lit,
/// <summary>
/// Malayalam mal.traineddata
/// </summary>
mal,
/// <summary>
/// Marathi mar.traineddata
/// </summary>
mar,
/// <summary>
/// Macedonian mkd.traineddata
/// </summary>
mkd,
/// <summary>
/// Maltese mlt.traineddata
/// </summary>
mlt,
/// <summary>
/// Malay msa.traineddata
/// </summary>
msa,
/// <summary>
/// Burmese mya.traineddata
/// </summary>
mya,
/// <summary>
/// Nepali nep.traineddata
/// </summary>
nep,
/// <summary>
/// Dutch; Flemish nld.traineddata
/// </summary>
nld,
/// <summary>
/// Norwegian nor.traineddata
/// </summary>
nor,
/// <summary>
/// Oriya ori.traineddata
/// </summary>
ori,
/// <summary>
/// Panjabi; Punjabi pan.traineddata
/// </summary>
pan,
/// <summary>
/// Polish pol.traineddata
/// </summary>
pol,
/// <summary>
/// Portuguese por.traineddata
/// </summary>
por,
/// <summary>
/// Pushto; Pashto pus.traineddata
/// </summary>
pus,
/// <summary>
/// Romanian; Moldavian; Moldovan ron.traineddata
/// </summary>
ron,
/// <summary>
/// Russian rus.traineddata
/// </summary>
rus,
/// <summary>
/// Sanskrit san.traineddata
/// </summary>
san,
/// <summary>
/// Sinhala; Sinhalese sin.traineddata
/// </summary>
sin,
/// <summary>
/// Slovak slk.traineddata
/// </summary>
slk,
/// <summary>
/// Slovenian slv.traineddata
/// </summary>
slv,
/// <summary>
/// Spanish; Castilian spa.traineddata
/// </summary>
spa,
/// <summary>
/// Spanish; Castilian - Old spa_old.traineddata
/// </summary>
spa_old,
/// <summary>
/// Albanian sqi.traineddata
/// </summary>
sqi,
/// <summary>
/// Serbian srp.traineddata
/// </summary>
srp,
/// <summary>
/// Serbian - Latin srp_latn.traineddata
/// </summary>
srp_latn,
/// <summary>
/// Swahili swa.traineddata
/// </summary>
swa,
/// <summary>
/// Swedish swe.traineddata
/// </summary>
swe,
/// <summary>
/// Syriac syr.traineddata
/// </summary>
syr,
/// <summary>
/// Tamil tam.traineddata
/// </summary>
tam,
/// <summary>
/// Telugu tel.traineddata
/// </summary>
tel,
/// <summary>
/// Tajik tgk.traineddata
/// </summary>
tgk,
/// <summary>
/// Tagalog tgl.traineddata
/// </summary>
tgl,
/// <summary>
/// Thai tha.traineddata
/// </summary>
tha,
/// <summary>
/// Tigrinya tir.traineddata
/// </summary>
tir,
/// <summary>
/// Turkish tur.traineddata
/// </summary>
tur,
/// <summary>
/// Uighur; Uyghur uig.traineddata
/// </summary>
uig,
/// <summary>
/// Ukrainian ukr.traineddata
/// </summary>
ukr,
/// <summary>
/// Urdu urd.traineddata
/// </summary>
urd,
/// <summary>
/// Uzbek uzb.traineddata
/// </summary>
uzb,
/// <summary>
/// Uzbek - Cyrillic uzb_cyrl.traineddata
/// </summary>
uzb_cyrl,
/// <summary>
/// Vietnamese vie.traineddata
/// </summary>
vie,
/// <summary>
/// Yiddish yid.traineddata
/// </summary>
yid
}
/// <summary>
/// Console sample: runs Tesseract OCR over a single image and prints the result.
/// </summary>
internal class Program
{
    /// <summary>
    /// Recognizes the text in one image using the <c>chi_sim</c> trained data found in
    /// <c>./tessdata</c>, then writes the mean confidence, the full recognized text and
    /// a word-by-word walk of the page layout to the console. Any exception raised while
    /// doing so is traced and printed instead of being rethrown.
    /// References:
    /// https://tesseract-ocr.github.io/tessdoc/Data-Files
    /// https://github.com/charlesw/tesseract-samples
    /// https://github.com/tesseract-ocr/tessdata
    /// https://github.com/danbloomberg/leptonica/releases
    /// </summary>
    /// <param name="args">
    /// Command-line arguments. When at least one is supplied, the first is used as the
    /// image path; otherwise <c>./geovindu2.jpg</c> is used.
    /// </param>
    public static void Main(string[] args)
    {
        var testImagePath = "./geovindu2.jpg"; // alternative sample image: phototest.tif
        if (args.Length > 0)
        {
            testImagePath = args[0];
        }

        try
        {
            // Other language choices tried by the author: chi_tra, eng.
            // Stacked usings dispose in reverse order (page, img, engine), exactly as nested blocks would.
            using (var engine = new TesseractEngine(@"./tessdata", LanguangeList.chi_sim.ToString(), EngineMode.Default))
            using (var img = Pix.LoadFromFile(testImagePath))
            using (var page = engine.Process(img))
            {
                var text = page.GetText();
                Console.WriteLine("Mean confidence: {0}", page.GetMeanConfidence());
                Console.WriteLine("Text (GetText): \r\n{0}", text);
                Console.WriteLine("Text (iterator):");

                using (var iter = page.GetIterator())
                {
                    iter.Begin();

                    // Four nested loops walk the layout: block -> paragraph -> text line -> word.
                    do
                    {
                        do
                        {
                            do
                            {
                                do
                                {
                                    if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                    {
                                        Console.WriteLine("<BLOCK>");
                                    }

                                    Console.Write(iter.GetText(PageIteratorLevel.Word));
                                    Console.Write(" ");

                                    // Last word of the line: terminate the console line.
                                    if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                    {
                                        Console.WriteLine();
                                    }
                                } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                // Last line of the paragraph: emit a blank separator line.
                                if (iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                                {
                                    Console.WriteLine();
                                }
                            } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                        } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                    } while (iter.Next(PageIteratorLevel.Block));
                }
            }
        }
        catch (Exception e)
        {
            Trace.TraceError(e.ToString());
            Console.WriteLine("Unexpected Error: " + e.Message);
            Console.WriteLine("Details: ");
            Console.WriteLine(e.ToString());
        }

        // Console.ReadKey throws InvalidOperationException when standard input is
        // redirected (piped or automated runs), so only pause on an interactive console.
        if (!Console.IsInputRedirected)
        {
            Console.Write("Press any key to continue . . . ");
            Console.ReadKey(true);
        }
    }
}
/// <summary>
/// Web Forms page that accepts an uploaded image, runs Tesseract OCR on it with the
/// <c>chi_sim</c> trained data and shows the mean confidence and recognized text.
/// References leptonica-1.82.0.dll (per the original author's note).
/// Author: geovindu, Geovin Du
/// </summary>
public class DefaultPage : System.Web.UI.Page
{
    #region Data

    // Controls of the upload (input) panel.
    protected Panel inputPanel;
    protected HtmlInputFile imageFile;
    protected HtmlButton submitFile;

    // Controls of the result panel.
    protected Panel resultPanel;
    protected HtmlGenericControl meanConfidenceLabel;
    protected HtmlTextArea resultText;
    protected HtmlButton restartButton;

    #endregion

    #region Event Handlers

    /// <summary>
    /// Runs OCR on the posted file and switches from the input panel to the result panel.
    /// Does nothing when no file, or an empty file, was posted.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="args">Event data (unused).</param>
    private void OnSubmitFileClicked(object sender, EventArgs args)
    {
        if (imageFile.PostedFile == null || imageFile.PostedFile.ContentLength <= 0)
        {
            return;
        }

        // Errors are deliberately not handled here; any exception propagates to the page.
        // Language data list: https://tesseract-ocr.github.io/tessdoc/Data-Files (eng is an alternative to chi_sim).
        // The upload stream is decoded into a Bitmap and then converted to a Pix; the
        // original author notes that Pix cannot be loaded directly from a stream.
        using (var ocrEngine = new TesseractEngine(Server.MapPath(@"~/tessdata"), LanguangeList.chi_sim.ToString(), EngineMode.Default))
        using (var bitmap = new System.Drawing.Bitmap(imageFile.PostedFile.InputStream))
        using (var pixImage = PixConverter.ToPix(bitmap))
        using (var ocrPage = ocrEngine.Process(pixImage))
        {
            meanConfidenceLabel.InnerText = string.Format("{0:P}", ocrPage.GetMeanConfidence());
            resultText.InnerText = ocrPage.GetText();
        }

        inputPanel.Visible = false;
        resultPanel.Visible = true;
    }

    /// <summary>
    /// Hides the result panel and shows the input panel again.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="args">Event data (unused).</param>
    private void OnRestartClicked(object sender, EventArgs args)
    {
        resultPanel.Visible = false;
        inputPanel.Visible = true;
    }

    #endregion

    #region Page Setup

    /// <summary>
    /// Wires the button handlers, then continues with the base page initialization.
    /// </summary>
    /// <param name="e">Event data forwarded to the base implementation.</param>
    protected override void OnInit(EventArgs e)
    {
        InitializeComponent();
        base.OnInit(e);
    }

    /// <summary>
    /// Subscribes the restart and submit buttons to their server-click handlers.
    /// </summary>
    private void InitializeComponent()
    {
        restartButton.ServerClick += new EventHandler(OnRestartClicked);
        submitFile.ServerClick += new EventHandler(OnSubmitFileClicked);
    }

    #endregion
}
输出:


GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models
https://arxiv.org/pdf/2303.10130.pdf
Sparks of Artificial General Intelligence: Early experiments with GPT-4
https://arxiv.org/abs/2303.12712
https://arxiv.org/pdf/2303.12712.pdf
浙公网安备 33010602011771号