Seven Python Crawler Programs
1. Scraping Douban movie information
import random
import urllib.request
from bs4 import BeautifulSoup
import codecs
from time import sleep

def main(url, headers):
    # Send the request
    page = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(page)
    contents = page.read()
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(contents, "html.parser")
    infofile.write("")
    print('爬取豆瓣电影250: \n')
    for tag in soup.find_all(attrs={"class": "item"}):
        # Ranking number
        num = tag.find('em').get_text()
        print(num)
        infofile.write(num + "\r\n")
        # Movie title
        name = tag.find_all(attrs={"class": "title"})
        zwname = name[0].get_text()
        print('[中文名称]', zwname)
        infofile.write("[中文名称]" + zwname + "\r\n")
        # Detail-page link
        url_movie = tag.find(attrs={"class": "hd"}).a
        urls = url_movie.attrs['href']
        print('[网页链接]', urls)
        infofile.write("[网页链接]" + urls + "\r\n")
        # Rating and number of reviews
        info = tag.find(attrs={"class": "star"}).get_text()
        info = info.replace('\n', ' ')
        info = info.lstrip()
        print('[评分评论]', info)
        # Short review
        info = tag.find(attrs={"class": "inq"})
        if info:  # some movies have no review; avoid calling get_text() on None
            content = info.get_text()
            print('[影评]', content)
            infofile.write(u"[影评]" + content + "\r\n")
        print('')

if __name__ == '__main__':
    # Output file
    infofile = codecs.open("豆瓣电影信息.txt", 'a', 'utf-8')
    # Request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    # Pagination: the loop body was cut off in the original; the lines below are an
    # assumed reconstruction of the usual Top 250 pagination (10 pages of 25 movies)
    i = 0
    while i < 10:
        print('页码', (i + 1))
        url = 'https://movie.douban.com/top250?start=' + str(i * 25)
        main(url, headers)
        sleep(3 + random.random())  # pause between pages to stay polite
        i = i + 1
    infofile.close()
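The script above writes plain text lines to 豆瓣电影信息.txt. If structured output is more convenient, a minimal sketch along the following lines (reusing the same selectors and headers, with a hypothetical douban.csv output name) stores one CSV row per movie:

import csv
import urllib.request
from bs4 import BeautifulSoup

def save_page_to_csv(url, headers, writer):
    # Same parsing as main(), but each movie becomes one CSV row: rank, title, link
    page = urllib.request.urlopen(urllib.request.Request(url, headers=headers))
    soup = BeautifulSoup(page.read(), "html.parser")
    for tag in soup.find_all(attrs={"class": "item"}):
        num = tag.find('em').get_text()
        name = tag.find_all(attrs={"class": "title"})[0].get_text()
        link = tag.find(attrs={"class": "hd"}).a.attrs['href']
        writer.writerow((num, name, link))

# Example usage for the first page of the list
if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0'}
    with open('douban.csv', 'w', newline='', encoding='utf-8') as f:
        save_page_to_csv('https://movie.douban.com/top250?start=0', headers, csv.writer(f))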
2. Scraping Baidu Tieba post comments

import csv
import requests
import re
import time

def main(page):
    url = f'https://tieba.baidu.com/p/7882177660?pn={page}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    resp = requests.get(url, headers=headers)
    html = resp.text
    # The three regular expressions below lost their HTML fragments when the article
    # was published; the patterns shown are assumed reconstructions based on Tieba's markup.
    # Comment text
    comments = re.findall('class="d_post_content j_d_post_content ">(.*?)</div>', html)
    # Comment authors
    users = re.findall('class="p_author_name j_user_card" href=".*?" target="_blank">(.*?)</a>', html)
    # Comment timestamps
    comment_times = re.findall('楼</span><span class="tail-info">(.*?)</span><div', html)
    for u, t, c in zip(users, comment_times, comments):
        # The filter condition was also truncated in the original; assumed here:
        # skip rows that still contain markup or are unusually long
        if 'img' in c or 'div' in c or len(c) > 50:
            continue
        csvwriter.writerow((u, t, c))
        print(u, t, c)
    print(f'第{page}页爬取完毕')

if __name__ == '__main__':
    with open('01.csv', 'a', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(('评论用户', '评论时间', '评论内容'))
        for page in range(1, 8):  # crawl the first 7 pages
            main(page)
            time.sleep(2)
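To check what was written, the rows can be read straight back with csv.reader; this short sketch also skips the blank rows that csv.writer tends to produce when the file is opened without newline='':

import csv

# Read 01.csv back and print each saved comment
with open('01.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        if not row:        # skip blank rows caused by the missing newline=''
            continue
        user, post_time, comment = row
        print(user, post_time, comment)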
3. Scraping historical weather data

import requests
from bs4 import BeautifulSoup
import urllib.request
import random

# Pool of User-Agent headers; a random one is used per request to avoid 403 Forbidden
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]

# Fetch a page
def get_content(url, headers):
    random_header = random.choice(headers)
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random_header)
    req.add_header("Host", "lishi.tianqi.com")
    req.add_header("Referer", "http://lishi.tianqi.com/")
    content = urllib.request.urlopen(req).read()
    return content

# Links for three months of historical weather
urls = ["http://lishi.tianqi.com/wuhan/202210.html",
        "http://lishi.tianqi.com/wuhan/202211.html",
        "http://lishi.tianqi.com/wuhan/202212.html"]

file = open('weather.csv', 'w')
for url in urls:
    response = get_content(url, my_headers)
    soup = BeautifulSoup(response, 'html.parser')
    # The attribute inside this selector was lost in the original;
    # 'thrui' is the assumed class of the daily-history list on lishi.tianqi.com
    weather_list = soup.select('ul[class="thrui"]')
    for weather in weather_list:
        ul_list = weather.select('li')
        for ul in ul_list:
            li_list = ul.select('div')
            row = ""
            for li in li_list:
                row += li.string + ','
            file.write(row + '\n')
file.close()
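To reuse the saved data, weather.csv can be read back with the csv module. Each written row ends with a trailing comma, so its last field is empty; the column meaning (date, high, low, weather, wind) is an assumption about lishi.tianqi.com's daily list rather than something the script enforces:

import csv

# Load weather.csv back into memory, dropping the empty trailing field of each row
days = []
with open('weather.csv') as f:
    for row in csv.reader(f):
        if row:
            days.append(row[:-1] if row[-1] == '' else row)

print(len(days), 'rows loaded')
for day in days[:3]:
    print(day)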
4. Scraping a web page title

import requests
from bs4 import BeautifulSoup

url = "http://project.webcat.top/bx/80607/24411"
# Send the request
response = requests.get(url)
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Read the page title
title = soup.title.string
print("网站标题:", title)
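A bare requests.get() will hang on a slow site and silently accept error pages; a slightly more defensive variant (timeout value chosen arbitrarily) could look like this:

import requests
from bs4 import BeautifulSoup

def fetch_title(url):
    # Fail fast on network problems and non-200 responses
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print("Request failed:", e)
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.title.string if soup.title else None

print("网站标题:", fetch_title("http://project.webcat.top/bx/80607/24411"))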
5. Scraping all links on a web page

import requests
from bs4 import BeautifulSoup

# Send an HTTP request and fetch the page
url = 'https://www.python.org/'
response = requests.get(url)
html_content = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the data we need - here, every link on the page
links = soup.find_all('a')
for link in links:
    print(link.get('href'))
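Many href values are relative paths or missing entirely; a small variation resolves them against the page URL with urljoin before printing:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = 'https://www.python.org/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
for link in soup.find_all('a'):
    href = link.get('href')
    if href:                       # some <a> tags have no href at all
        print(urljoin(url, href))  # turn relative paths into absolute URLs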
6. Scraping images from a web page

import requests
from bs4 import BeautifulSoup
import urllib.request

# Fetch and parse the page
url = 'http://vip.1905.com/m/play/1655899.shtml'  # replace with the page you want to crawl
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the image URLs
image_urls = []
images = soup.find_all('img')
for image in images:
    image_url = image.get('src')
    if image_url:
        image_urls.append(image_url)

# Download each image to a local file; an index is added to the file name
# so that later images do not overwrite earlier ones
for i, image_url in enumerate(image_urls):
    urllib.request.urlretrieve(image_url, f'rrr_{i}.jpg')  # replace with your own file name/path
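Image src attributes are often relative paths or data: URIs that urlretrieve cannot fetch. A sketch of a more robust download loop (same target page, hypothetical image_<n>.jpg file names) resolves each src against the page URL and skips anything that is not plain HTTP(S):

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = 'http://vip.1905.com/m/play/1655899.shtml'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

for i, img in enumerate(soup.find_all('img')):
    src = img.get('src')
    if not src:
        continue
    full_url = urljoin(url, src)  # resolve relative paths
    if not full_url.startswith(('http://', 'https://')):
        continue                  # skip data: URIs and the like
    with open(f'image_{i}.jpg', 'wb') as f:
        f.write(requests.get(full_url).content)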
7. Scraping the full text of a web page

import requests
from bs4 import BeautifulSoup

def scrape_html(url):
    # Send the HTTP request
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find and print the text of every paragraph (<p>) tag
        for p_tag in soup.find_all('p'):
            print(p_tag.get_text())
    else:
        print(f"Error: {response.status_code} when fetching {url}")

# Test the function
scrape_html('https://www.bafangwy.com/')  # replace with the URL you want to crawl
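When the whole visible text of a page is needed rather than only its <p> paragraphs, one option is to drop the <script> and <style> blocks and call get_text() on what remains; a minimal sketch:

import requests
from bs4 import BeautifulSoup

def scrape_all_text(url):
    # Fetch the page and return all visible text, not only <p> tags
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: {response.status_code} when fetching {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup(['script', 'style']):  # remove markup that carries no visible text
        tag.decompose()
    return soup.get_text(separator='\n', strip=True)

print(scrape_all_text('https://www.bafangwy.com/'))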
The End