class MyHTMLparser(HTMLParser):
    """Collect event records from HTML and print each one once complete.

    A record is the dict ``self.info`` with the keys ``'event-title'``,
    ``'datetime'`` and ``'event-location'``. The ``datetime`` field is read
    from the ``datetime`` attribute of a ``<time>`` tag; every other field is
    filled with the first non-blank text that follows a tag whose ``class``
    attribute equals the field name. As soon as all three fields are set the
    record is printed and cleared so the next one can be collected.
    """

    def __init__(self) -> None:
        super().__init__()
        self.info = {
            'event-title': None,
            'datetime': None,
            'event-location': None,
        }
        # Key of ``self.info`` that the next text node should fill, or None
        # when no field is currently waiting for text.
        self.__parserdata = None

    # ``datetime`` is handled separately; every other field is located by
    # its class name.
    def handle_starttag(self, tag, attrs) -> None:
        """Record the ``datetime`` attribute and note which field comes next.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` attribute pairs of the tag.
        """
        if tag == 'time':
            # Look the attribute up by name instead of assuming it is the
            # first one: a <time> tag may carry no attributes at all, or
            # list another attribute (e.g. class) before ``datetime``.
            for name, value in attrs:
                if name == 'datetime':
                    self.info['datetime'] = value
                    break
        for dataTag in self.info:
            if ('class', dataTag) in attrs:
                self.__parserdata = dataTag

    def handle_data(self, data: str) -> None:
        """Store *data* in the pending field, ignoring blank text nodes.

        Args:
            data: Text found between tags.
        """
        if self.__parserdata is None:
            return
        # Whitespace between the marked tag and a nested tag (indentation,
        # newlines) is not the field's content; wait for real text.
        if not data.strip():
            return
        self.info[self.__parserdata] = data
        self.__parserdata = None

    # Once every field is filled, print the record and reset it.
    def handle_endtag(self, tag: str) -> None:
        """Print and clear the record when all of its fields are filled.

        Args:
            tag: Name of the tag that was closed (not used).
        """
        if None not in self.info.values():
            print(self.info)
            for key in self.info:
                self.info[key] = None
# Fetching the page on every run is too slow, so download the whole page once and reuse the local copy.
if __name__ == '__main__':
    # Download the events page once, cache it next to the script, and parse
    # the cached copy on later runs.
    url = 'https://www.python.org/events/python-events/'
    cache_file = Path('pythonEvent.html')
    # ``exists`` must be called: the bare bound method is always truthy, which
    # made the download branch unreachable and the first run fail on open().
    if not cache_file.exists():
        req = request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50')
        with request.urlopen(req, timeout=30) as page:
            html_doc = page.read().decode('utf-8')
            print(f'status: {page.status} {page.reason}')
        with cache_file.open('w', encoding='utf-8') as f:
            # Write the document in one call; iterating over a str yields
            # single characters, not lines.
            f.write(html_doc)
    else:
        with cache_file.open('r', encoding='utf-8') as f:
            html_doc = f.read()
    parser = MyHTMLparser()
    parser.feed(html_doc)
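# Spencer
# The parser takes a shortcut by relying directly on the page's attribute conventions.
# Fetching the page on every run is too slow, so the whole page is downloaded once.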