Python: XML helper module (XmlHelper.py)
https://github.com/tesseract-ocr/tesseract
Tesseract引擎和中文包 (这是HP实验室最早开发的OCR)
https://pan.baidu.com/share/init?surl=XpeRVgiPTU7mmiMiyaXThg
python
https://digi.bib.uni-mannheim.de/tesseract/
https://github.com/ViewFaceCore/ViewFaceCore
pip install beautifulsoup4
Sample XML document:
<?xml version="1.0"?> <data> <country name="Liechtenstein"> <rank>1</rank> <year>2008</year> <gdppc>141100</gdppc> <neighbor name="Austria" direction="E"/> <neighbor name="Switzerland" direction="W"/> </country> <country name="Singapore"> <rank>4</rank> <year>2011</year> <gdppc>59900</gdppc> <neighbor name="Malaysia" direction="N"/> </country> <country name="Panama"> <rank>68</rank> <year>2011</year> <gdppc>13600</gdppc> <neighbor name="Costa Rica" direction="W"/> <neighbor name="Colombia" direction="E"/> </country> </data>
# encoding: utf-8
# Copyright 2023 涂聚文有限公司. All rights reserved.
# Description: XML read/write helpers plus an RSS-feed-to-CSV pipeline.
#              (pip install requests — needed only by loadRSS/main.)
# Author   : geovindu, Geovin Du 涂聚文.
# IDE      : PyCharm 2023.1, Python 3.11
# Datetime : 2023/7/16 22:17
# User     : geovindu
# Product  : PyCharm
# Project  : pythonTkinterDemo
# File     : XmlHelper.py
# explain  : learning exercise

import csv
import os
import sys
import xml.etree.ElementTree as ET
from xml.dom import minidom


def readXml(url):
    """Parse the XML file at *url* and print each top-level child's tag and attributes.

    Args:
        url: path to an XML file readable by xml.etree.ElementTree.
    """
    tree = ET.parse(url)
    root = tree.getroot()
    for child in root:
        print(child.tag, child.attrib)


def writeXml(url):
    """Build a small demo DOM document and write it to *url* as UTF-8 XML.

    Produces:
        <root><movie shelf="New Arrivals"><type>War, Thriller</type></movie></root>

    Errors while writing are caught and printed rather than raised.
    """
    doc = minidom.Document()
    # An XML document must have exactly one root element.
    root_node = doc.createElement('root')
    doc.appendChild(root_node)
    # First-level child element.
    movie_node = doc.createElement('movie')
    root_node.appendChild(movie_node)
    movie_node.setAttribute('shelf', 'New Arrivals')
    # Second-level child element.
    type_node = doc.createElement('type')
    movie_node.appendChild(type_node)
    # In the DOM model, text content is itself a child node.
    type_node.appendChild(doc.createTextNode("War, Thriller"))
    try:
        with open(url, 'w', encoding='UTF-8') as f:
            doc.writexml(f, indent='', addindent='\t', newl='\n', encoding='UTF-8')
    except Exception as e:
        print('错误:', e)


def loadRSS():
    """Download the Hindustan Times top-news RSS feed and save it as 'topnewsfeed.xml'."""
    # Imported lazily so the rest of the module works without requests installed.
    import requests

    url = 'http://www.hindustantimes.com/rss/topnews/rssfeed.xml'
    resp = requests.get(url)
    with open('topnewsfeed.xml', 'wb') as f:
        f.write(resp.content)


def parseXML(xmlfile):
    """Parse an RSS file and return a list of dicts, one per <item>.

    A media enclosure (namespaced media:content tag) is stored under the
    'media' key; every other child element is stored under its tag name.

    Args:
        xmlfile: path to an RSS XML file.

    Returns:
        list[dict[str, str]]: one dict per news item.
    """
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    newsitems = []
    for item in root.findall('./channel/item'):
        news = {}
        for child in item:
            # Special-case the namespaced media:content element.
            if child.tag == '{http://search.yahoo.com/mrss/}content':
                news['media'] = child.attrib['url']
            else:
                # Keep text as str (the old .encode('utf8') produced bytes that
                # corrupted the CSV) and guard empty elements whose .text is None.
                news[child.tag] = child.text if child.text is not None else ''
        newsitems.append(news)
    return newsitems


def savetoCSV(newsitems, filename):
    """Write *newsitems* (list of dicts) to *filename* as a CSV file.

    Args:
        newsitems: list of dicts as produced by parseXML().
        filename: destination CSV path.
    """
    # Columns to keep, in output order.
    fields = ['guid', 'title', 'pubDate', 'description', 'link', 'media']
    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore': real RSS items carry tags beyond the fields
        # above; the default would raise ValueError on the first extra key.
        writer = csv.DictWriter(csvfile, fieldnames=fields, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(newsitems)


def main():
    """Fetch the RSS feed, parse it, and dump the items to geovindu.csv."""
    # Refresh the local copy of the feed from the web.
    loadRSS()
    newsitems = parseXML('topnewsfeed.xml')
    savetoCSV(newsitems, 'geovindu.csv')
哲学管理(学)人生, 文学艺术生活, 自动(计算机学)物理(学)工作, 生物(学)化学逆境, 历史(学)测绘(学)时间, 经济(学)数学金钱(理财), 心理(学)医学情绪, 诗词美容情感, 美学建筑(学)家园, 解构建构(分析)整合学习, 智商情商(IQ、EQ)运筹(学)生存.---Geovin Du(涂聚文)