# Parse the page into an lxml tree. Per the original note, `source` is the
# HTML text of a response (response.text) and is defined in main.
dom = etree.HTML(source)

# The fields extracted below are site-specific; adjust the patterns to the
# target site as needed.
articleId = re.findall(articleIdPattern, link)[0]
raw_title = dom.xpath(titlePattern)[0]
content_node = dom.xpath(contentPattern)[0]

# Serialize the matched content node back to an HTML string.
content_html = tostring(
    content_node, encoding="utf-8", pretty_print=True, method="html"
).decode("utf-8")

# The category is read from the full title, so it must be taken before the
# trailing site suffix is stripped from the title.
category = re.findall(categoryPattern, raw_title)[0]
title = re.sub(r'_.*?_脚本之家', '', raw_title)

# Run the HTML through html_encode (defined elsewhere) to transcode it.
content = html_encode(content_html)

articleUrl, crawlSign, publishSign = link, 'Yes', 'No'

# Collect every field into a single dictionary.
information = {
    'article_id': articleId,
    'title': title,
    'content': content,
    'category': category,
    'articleUrl': articleUrl,
    'crawlSign': crawlSign,
    'publishSign': publishSign,
}
# print(information)
# Next step: extract all image URLs into a set(), and rewrite the image URLs
# in the source so they point at the new folder.
# Post Views: 11  (blog page footer residue, not part of the script)