# Parse the page into an lxml element tree.
# `source` is the HTML text of a response (response.text); it is defined in main.
dom = etree.HTML(source)

# Fields extracted below; the patterns need to be adjusted for the target site.
# Each `[0]` takes the first match and raises IndexError when nothing matches.
articleId = re.findall(articleIdPattern, link)[0]
title = dom.xpath(titlePattern)[0]

# Serialize the matched content node back to an HTML string.
content = dom.xpath(contentPattern)[0]
content = tostring(
    content, encoding="utf-8", pretty_print=True, method="html"
).decode("utf-8")

# The category is read from the full title, so it must be extracted before
# the trailing site suffix is stripped from the title.
category = re.findall(categoryPattern, title)[0]
title = re.sub(r'_.*?_脚本之家', '', title)

content = html_encode(content)  # encode the HTML via the project's html_encode helper
articleUrl = link
crawlSign = 'Yes'
publishSign = 'No'

# Collect all fields into a single dictionary.
information = {
    "article_id": articleId,
    "title": title,
    "content": content,
    "category": category,
    "articleUrl": articleUrl,
    "crawlSign": crawlSign,
    "publishSign": publishSign,
}
# print(information)

# Extract all image URLs into a set(), and rewrite the image URLs in the
# source so that they point to the new folder.