# -*- coding: utf-8 -*-
"""news.py — scrape the Sina world-news index, fetch each article's
details, and write the results to an Excel workbook with xlwt."""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import xlwt


def get_list(url):
    """Fetch the news index page and return a list of {'title', 'url'} dicts.

    Items without the expected <h2><a> structure (ads/placeholders) are
    skipped with a message instead of aborting the whole scrape.
    """
    res = requests.get(url)
    res.encoding = 'utf-8'
    html = BeautifulSoup(res.text, 'html.parser')
    news_list = []
    for item in html.select('.news-item'):
        try:
            link = item.select('h2 a')[0]
            news_list.append({'title': link.text, 'url': link.get('href')})
        except IndexError:
            # Narrowed from a bare `except:` — the only expected failure is
            # indexing [0] on an empty selection.
            print('出现异常')
    return news_list


def get_detail(url):
    """Fetch one article page and return a dict with title, createtime,
    place (source), article body, author, and the article url."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    html = BeautifulSoup(res.text, 'html.parser')
    result = {}
    result['title'] = html.select('.main-title')[0].text
    # Publication time, e.g. "2020年10月19日 10:30".
    timesource = html.select('.date-source span')[0].text
    createtime = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
    # BUG FIX: the original called strftime() and discarded its return value,
    # so a raw datetime object was stored (and written to Excel as a serial
    # number). Store the formatted string that was clearly intended.
    result['createtime'] = createtime.strftime('%Y-%m-%d')
    result['place'] = html.select('.date-source a')[0].text
    # The last <p> is the editor byline; join the remaining paragraphs.
    paragraphs = [p.text.strip() for p in html.select('#article p')[:-1]]
    result['article'] = ' '.join(paragraphs)
    # BUG FIX: str.strip('责任编辑:') treats the argument as a CHARACTER SET
    # and strips those characters from BOTH ends, which can also remove
    # legitimate leading/trailing characters of the author's name.
    # Remove the exact label prefix instead.
    author = html.select('.show_author')[0].text.strip()
    prefix = '责任编辑:'
    if author.startswith(prefix):
        author = author[len(prefix):]
    result['author'] = author
    result['url'] = url
    return result


if __name__ == "__main__":
    # Scrape the index, then each article, writing one row per article.
    news_list = get_list('https://news.sina.com.cn/world/')
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('ke_qq')
    head = ['标题', '时间', '作者', '链接', '来源', '内容']  # header row
    for col, title in enumerate(head):
        sheet.write(0, col, title)
    for i, item in enumerate(news_list):
        try:
            obj = get_detail(item['url'])
            row = (obj['title'], obj['createtime'], obj['author'],
                   obj['url'], obj['place'], obj['article'])
            for col, value in enumerate(row):
                sheet.write(i + 1, col, value)
            print(str(i), '写入成功')
        except Exception:
            # Best-effort: one malformed article page must not stop the run.
            print(str(i), '出现异常')
    # BUG FIX: '\k' in 'F:\ke.xls' is an invalid escape sequence
    # (DeprecationWarning now, a SyntaxError in future Python); use a raw
    # string. The runtime value is unchanged.
    book.save(r'F:\ke.xls')
# 作者:China_Lzw
# 链接:https://blog.csdn.net/qq285679784/article/details/109163785
# 来源:CSDN
# 著作权归作者所有,转载请联系作者获得授权,切勿私自转载。