Commit 8581f845, author: 樊倪骄

Crawler
# Python cache
__pycache__/
*.py[cod]
*.pyo
*.pyd
*.so
# Virtual environments
.venv/
venv/
env/
ENV/
# Test and tooling cache
.pytest_cache/
.mypy_cache/
.ruff_cache/
.coverage
.coverage.*
htmlcov/
# Build artifacts
build/
dist/
.eggs/
*.egg-info/
pip-wheel-metadata/
# Local environment and secrets
.env
.env.*
!.env.example
# Local runtime files
*.log
logs/
.data/
tmp/
temp/
*.db
*.sqlite
*.sqlite3
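# Note: start_background.sh (later in this commit) writes crawler_api.pid next to the
# code; if that file should stay untracked, an entry such as "*.pid" could be added
# here too (a suggestion only, not part of the original ignore list).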
# IDE and OS files
.vscode/
.idea/
.DS_Store
Thumbs.db
# Jupyter
.ipynb_checkpoints/
import json
import time
import requests
from playwright.sync_api import sync_playwright
from newspaper import Article, ArticleException
from bs4 import BeautifulSoup
def get_static_html(url, timeout=10):
"""用 requests 获取静态 HTML(优先方案)"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=timeout)
response.raise_for_status()
response.encoding = response.apparent_encoding # 自动检测编码
return response.text
except Exception as e:
print(f"requests 获取 {url} 失败:{e}")
return None
def get_dynamic_html(url, scroll_times=3, timeout=30):  # timeout 单位:秒
"""用 Playwright 获取动态 HTML(备用方案)"""
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
# 设置用户代理
context = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
page = context.new_page()
try:
page.goto(url, wait_until="networkidle", timeout=timeout*1000)
# 尝试等待内容区域加载(针对东方财富等网站)
try:
page.wait_for_selector(".xeditor_content, .article-body, article, .content", timeout=10000)
except:
pass # 如果找不到特定元素,继续执行
for _ in range(scroll_times):
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(1000)
html = page.content()
except Exception as e:
print(f"Playwright 爬取 {url} 失败:{e}")
html = None
finally:
browser.close()
return html
def get_html_with_fallback(url, scroll_times=3, timeout=30):  # timeout 单位:秒
"""先尝试用 requests + BeautifulSoup,失败再用 Playwright"""
print(f"尝试使用 requests 获取 {url}...")
html = get_static_html(url)
if html:
# 尝试用 BeautifulSoup 验证是否能提取到内容
soup = BeautifulSoup(html, 'html.parser')
content_selectors = [
'.xeditor_content.app_h5_article',
'.xeditor_content',
'.article-body',
'.article-content',
'.content',
'article',
'[class*="article"]',
'[class*="content"]'
]
# 检查是否能找到内容
content_found = False
for selector in content_selectors:
if soup.select_one(selector):
content_found = True
break
# 如果找到了内容,返回 HTML
if content_found:
print(f"✓ requests 成功获取 {url},内容可用")
return html
else:
print(f"⚠ requests 获取了 HTML,但未找到内容,尝试使用 Playwright...")
else:
print(f"⚠ requests 获取失败,尝试使用 Playwright...")
# 如果静态获取失败或找不到内容,使用 Playwright
print(f"使用 Playwright 获取 {url}...")
html = get_dynamic_html(url, scroll_times, timeout)
if html:
print(f"✓ Playwright 成功获取 {url}")
return html
def extract_with_beautifulsoup(html, url):
"""用 BeautifulSoup 直接提取内容(备用方案)"""
try:
soup = BeautifulSoup(html, 'html.parser')
# 尝试多种常见的内容选择器
content_selectors = [
'.xeditor_content.app_h5_article', # 东方财富
'.xeditor_content',
'.article-body',
'.article-content',
'.content',
'article',
'[class*="article"]',
'[class*="content"]'
]
content_text = ""
title = ""
# 提取标题
title_selectors = ['h1', '.title', '.article-title', 'title']
for selector in title_selectors:
title_elem = soup.select_one(selector)
if title_elem:
title = title_elem.get_text(strip=True)
break
# 提取内容
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
# 移除脚本和样式标签
for script in content_elem(["script", "style", "a"]):
script.decompose()
content_text = content_elem.get_text(separator='\n', strip=True)
if content_text and len(content_text) > 50: # 确保内容足够长
break
# 如果还是没找到,尝试提取所有p标签
if not content_text or len(content_text) < 50:
paragraphs = soup.find_all('p')
content_text = '\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
return {
"url": url,
"title": title or soup.title.string if soup.title else "",
"text": content_text
}
except Exception as e:
print(f"BeautifulSoup 解析 {url} 失败:{e}")
return None
def extract_news_content(html, url):
"""用 Newspaper3k 提取内容,失败则用 BeautifulSoup"""
if not html:
return None
# 首先尝试用 Newspaper3k
article = Article(url)
article.set_html(html)
try:
article.parse()
# 如果提取的内容为空或太短,使用备用方案
if article.text and len(article.text.strip()) > 50:
return {
"url": url,
"title": article.title,
"text": article.text,
}
else:
print(f"Newspaper3k 提取的内容为空或太短,尝试使用 BeautifulSoup...")
except ArticleException as e:
print(f"Newspaper3k 解析 {url} 失败:{e},尝试使用 BeautifulSoup...")
except Exception as e:
print(f"Newspaper3k 解析 {url} 发生错误:{e},尝试使用 BeautifulSoup...")
# 备用方案:使用 BeautifulSoup
return extract_with_beautifulsoup(html, url)
def batch_crawl_news(url_list, output_file="news_aggregated.json"):
"""批量爬取并保存新闻内容"""
news_list = []
for i, url in enumerate(url_list, 1):
print(f"\n正在处理第 {i}/{len(url_list)} 个链接:{url}")
# 先尝试 requests + BeautifulSoup,失败再用 Playwright
html = get_html_with_fallback(url)
if not html:
print(f"✗ 无法获取 {url} 的 HTML,跳过")
continue
# 提取内容
news_data = extract_news_content(html, url)
if news_data:
news_list.append(news_data)
print(f"✓ 成功提取内容:{news_data.get('title', '无标题')[:50]}...")
else:
print(f"✗ 无法提取 {url} 的内容")
# 避免请求过快被封,间隔 2 秒
time.sleep(2)
# 保存到 JSON 文件
with open(output_file, "w", encoding="utf-8") as f:
json.dump(news_list, f, ensure_ascii=False, indent=2)
print(f"\n所有处理完成,共提取 {len(news_list)} 条新闻,已保存到 {output_file}")
# 测试:批量爬取科技新闻
if __name__ == "__main__":
# 待爬取的新闻链接列表(可替换为其他链接)
news_urls = [
"http://www.chnmodel.com/zcjh/moq/2018-07-23/5501.html",
"http://caifuhao.eastmoney.com/news/20240225070209973371220",
"https://caifuhao.eastmoney.com/news/20240225085325506722330",
"https://www.toutiao.com/article/7353226433517175305/",
"https://www.waytoagi.com/question/56972",
"http://www.chnmodel.com/qzgl/2018-07-18/4355.html",
"https://max.book118.com/html/2024/0125/6000210030010041.shtm"
]
# 批量爬取并保存
batch_crawl_news(news_urls, "tech_news.json")
"""
并发性能基准测试
对比不同并发数的爬取速度
"""
import asyncio
import time
from playwright.async_api import async_playwright
async def crawl_page_simple(browser, url: str, index: int):
"""简化的页面爬取(用于基准测试)"""
try:
page = await browser.new_page()
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
title = await page.title()
await page.close()
return {"url": url, "title": title, "index": index}
except Exception as e:
return {"url": url, "error": str(e), "index": index}
async def benchmark(urls: list, max_concurrent: int):
"""基准测试指定并发数"""
print(f"\n{'='*60}")
print(f"测试配置: {len(urls)} 个URL, 并发数: {max_concurrent}")
print(f"{'='*60}")
start_time = time.time()
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=True,
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
)
semaphore = asyncio.Semaphore(max_concurrent)
async def crawl_with_semaphore(url, index):
async with semaphore:
return await crawl_page_simple(browser, url, index)
tasks = [crawl_with_semaphore(url, i+1) for i, url in enumerate(urls)]
results = await asyncio.gather(*tasks, return_exceptions=True)
await browser.close()
elapsed = time.time() - start_time
success_count = sum(1 for r in results if isinstance(r, dict) and "error" not in r)
print(f"\n✓ 完成!")
print(f" 总耗时: {elapsed:.2f}秒")
print(f" 成功: {success_count}/{len(urls)}")
print(f" 平均速度: {elapsed/len(urls):.2f}秒/页")
return elapsed, success_count
async def main():
"""主函数:对比不同并发数"""
# 测试 URL
test_urls = [
"https://docs.python.org/3/",
"https://github.com/trending",
"https://news.ycombinator.com/",
"https://www.wikipedia.org/",
"https://stackoverflow.com/",
"https://developer.mozilla.org/",
"https://www.rust-lang.org/",
"https://golang.org/",
"https://www.typescriptlang.org/",
"https://reactjs.org/",
]
print("\n" + "="*60)
print("并发性能对比测试")
print("="*60)
# 测试不同的并发数
concurrent_levels = [1, 2, 5, 10]
results = {}
for concurrent in concurrent_levels:
elapsed, success = await benchmark(test_urls, concurrent)
results[concurrent] = elapsed
await asyncio.sleep(2) # 间隔 2 秒避免服务器压力
# 打印对比结果
print("\n" + "="*60)
print("性能对比总结")
print("="*60)
baseline = results[1] # 以并发1为基准
for concurrent, elapsed in results.items():
speedup = baseline / elapsed
print(f"并发 {concurrent:2d}: {elapsed:6.2f}秒 (提速 {speedup:.2f}x)")
# 推荐配置
print("\n" + "="*60)
print("推荐配置")
print("="*60)
print("📊 根据测试结果:")
print(" • 并发 2-3:适合网络不稳定或资源受限场景")
print(" • 并发 5: 推荐配置,平衡性能和稳定性 ⭐")
print(" • 并发 8-10:追求极致速度,需要较好的网络和硬件")
print("\n当前 API 配置: 5 个并发 ✓")
if __name__ == "__main__":
asyncio.run(main())
"""
Gunicorn 配置文件(生产环境推荐)
使用方式:
gunicorn -c gunicorn_config.py api_server_optimized:app
或者:
gunicorn api_server_optimized:app \
--workers 4 \
--worker-class uvicorn.workers.UvicornWorker \
--bind 0.0.0.0:8106 \
--timeout 300
"""
import multiprocessing
import os
# ==========================================
# 服务器配置
# ==========================================
# 绑定地址和端口
bind = "0.0.0.0:8106"
# Worker 进程数
# 推荐:(2 x CPU核心数) + 1
# 对于爬虫服务,建议不要设置太多(因为每个 worker 都会有自己的并发池)
workers = int(os.getenv("WORKERS", "2")) # 默认 2 个 worker
# Worker 类(必须使用 UvicornWorker 以支持 ASGI/异步)
worker_class = "uvicorn.workers.UvicornWorker"
# 每个 worker 的线程数(对于异步应用,通常设为 1)
threads = 1
# ==========================================
# Worker 进程管理
# ==========================================
# Worker 超时时间(秒)
# 如果 worker 在这个时间内没有响应,会被强制重启
# 对于爬虫任务,需要设置较长的超时时间
timeout = 300 # 5 分钟
# 优雅重启超时时间(秒)
graceful_timeout = 30
# Keep-Alive 连接保持时间(秒)
# 在 Keep-Alive 连接上等待下一个请求的最长时间,超过后连接会被关闭
keepalive = 5
# ==========================================
# 性能调优
# ==========================================
# 最大请求数(处理这么多请求后 worker 会重启,防止内存泄漏)
max_requests = 1000
max_requests_jitter = 50 # 随机抖动,避免所有 worker 同时重启
# 工作目录
chdir = os.path.dirname(os.path.abspath(__file__))
# ==========================================
# 日志配置
# ==========================================
# 日志级别
loglevel = "info"
# 访问日志格式
accesslog = "-" # 输出到 stdout
access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %(D)s'
# 错误日志
errorlog = "-" # 输出到 stderr
# 捕获输出(捕获 print 和标准输出)
capture_output = True
# ==========================================
# 进程命名
# ==========================================
proc_name = "crawler_api"
# ==========================================
# 服务器机制
# ==========================================
# 守护进程(后台运行,生产环境建议用 systemd/supervisor 管理)
daemon = False
# PID 文件
pidfile = "/tmp/crawler_api.pid"
# 用户和组(如果需要降权运行)
# user = "nobody"
# group = "nogroup"
# ==========================================
# 服务器钩子(可选)
# ==========================================
def on_starting(server):
"""服务器启动前"""
print(f"🚀 Gunicorn 正在启动...")
print(f" Workers: {workers}")
print(f" Bind: {bind}")
print(f" Worker Class: {worker_class}")
def on_reload(server):
"""服务器重载时"""
print("🔄 Gunicorn 正在重新加载...")
def when_ready(server):
"""服务器准备就绪"""
print(f"✅ Gunicorn 已启动完成")
print(f" 监听地址: http://{bind}")
print(f" API 文档: http://{bind.split(':')[0]}:{bind.split(':')[1]}/docs")
def on_exit(server):
"""服务器退出"""
print("👋 Gunicorn 正在关闭...")
def worker_int(worker):
"""Worker 收到 SIGINT 信号"""
print(f"⚠️ Worker {worker.pid} 收到中断信号")
def worker_abort(worker):
"""Worker 被强制终止"""
print(f"❌ Worker {worker.pid} 被强制终止")
# ==========================================
# SSL/TLS(如果需要 HTTPS)
# ==========================================
# keyfile = "/path/to/key.pem"
# certfile = "/path/to/cert.pem"
# ssl_version = "TLSv1_2"
# cert_reqs = 0 # ssl.CERT_NONE
# ca_certs = None
# ciphers = None
"""
使用 Playwright 异步 API 实现并发爬取
支持可配置的并发任务数(默认 2)
"""
import asyncio
import time
from playwright.async_api import async_playwright
async def crawl_page(browser, url: str, index: int):
"""
爬取单个页面
Args:
browser: Playwright 浏览器实例
url: 要爬取的URL
index: 任务索引
Returns:
包含 URL、标题和内容的字典
"""
print(f"[任务 {index}] 开始爬取: {url}")
start_time = time.time()
try:
# 创建新页面(每个任务独立的页面)
page = await browser.new_page()
# 访问页面
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
print(f"[任务 {index}] 页面已加载: {url}")
# 等待内容加载(可选)
try:
await page.wait_for_selector("body", timeout=5000)
except:
pass
# 提取标题
title = await page.title()
# 提取正文内容
# 尝试多种常见的内容选择器
content = ""
selectors = [
".article-content",
".article-body",
".content",
"article",
".post-content",
"main"
]
for selector in selectors:
try:
element = await page.query_selector(selector)
if element:
content = await element.inner_text()
if len(content) > 100: # 确保内容足够长
break
except:
continue
# 如果没找到特定容器,提取所有段落
if not content or len(content) < 100:
paragraphs = await page.query_selector_all("p")
texts = []
for p in paragraphs:
text = await p.inner_text()
if text.strip():
texts.append(text.strip())
content = "\n".join(texts)
elapsed = time.time() - start_time
print(f"[任务 {index}] ✓ 完成: {title[:50]}... (耗时 {elapsed:.2f}秒)")
await page.close()
return {
"url": url,
"title": title,
"content": content[:500] + "..." if len(content) > 500 else content,
"content_length": len(content),
"elapsed": elapsed
}
except Exception as e:
elapsed = time.time() - start_time
print(f"[任务 {index}] ✗ 失败: {url} - {e} (耗时 {elapsed:.2f}秒)")
return {
"url": url,
"error": str(e),
"elapsed": elapsed
}
async def crawl_batch(urls: list, max_concurrent: int = 2):
"""
并发爬取多个页面
Args:
urls: URL 列表
max_concurrent: 最大并发数(默认2)
Returns:
爬取结果列表
"""
print(f"\n{'='*60}")
print(f"开始批量爬取: {len(urls)} 个URL, 最大并发数: {max_concurrent}")
print(f"{'='*60}\n")
overall_start = time.time()
async with async_playwright() as p:
# 启动浏览器(一个浏览器实例可以有多个页面)
browser = await p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
]
)
# 创建信号量来控制并发数
semaphore = asyncio.Semaphore(max_concurrent)
async def crawl_with_semaphore(url, index):
"""带信号量控制的爬取"""
async with semaphore:
return await crawl_page(browser, url, index)
# 创建所有任务
tasks = [
crawl_with_semaphore(url, i + 1)
for i, url in enumerate(urls)
]
# 并发执行所有任务
results = await asyncio.gather(*tasks, return_exceptions=True)
# 关闭浏览器
await browser.close()
overall_elapsed = time.time() - overall_start
# 统计结果
success_count = sum(1 for r in results if isinstance(r, dict) and "error" not in r)
failed_count = len(results) - success_count
print(f"\n{'='*60}")
print(f"批量爬取完成!")
print(f"总耗时: {overall_elapsed:.2f}秒")
print(f"成功: {success_count} 个")
print(f"失败: {failed_count} 个")
print(f"{'='*60}\n")
return results
async def main():
"""主函数"""
# 测试 URL 列表(可以替换为你想爬取的网址)
test_urls = [
"https://www.toutiao.com/article/7353226433517175305/",
"https://www.waytoagi.com/question/56972",
"https://docs.python.org/3/",
]
# 执行爬取(5个并发)
results = await crawl_batch(test_urls, max_concurrent=5)
# 打印详细结果
print("\n详细结果:")
print("=" * 60)
for i, result in enumerate(results, 1):
if isinstance(result, dict):
if "error" in result:
print(f"\n[{i}] ✗ {result['url']}")
print(f" 错误: {result['error']}")
else:
print(f"\n[{i}] ✓ {result['url']}")
print(f" 标题: {result['title']}")
print(f" 内容长度: {result['content_length']} 字符")
print(f" 耗时: {result['elapsed']:.2f}秒")
else:
print(f"\n[{i}] ✗ 异常: {result}")
if __name__ == "__main__":
# 运行异步主函数
asyncio.run(main())
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0
requests==2.31.0
playwright==1.40.0
newspaper3k==0.2.8
beautifulsoup4==4.12.2
lxml==4.9.3
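requirements.txt above pins FastAPI, Uvicorn and Pydantic, and the scripts below launch "api_server_optimized:app", but that module itself is not part of this commit (gunicorn, which the production scripts rely on, is also not pinned here, and Playwright additionally needs its browsers installed once via "playwright install chromium"). A minimal hypothetical sketch of such an app is shown below; the module name async_crawler, the request fields and the /crawl route are assumptions, while the /health route and port 8106 are taken from the scripts in this commit.

# Hypothetical sketch only: the real api_server_optimized.py is not included in this commit.
from fastapi import FastAPI
from pydantic import BaseModel

# Assumption: the async Playwright crawler above is saved as async_crawler.py
from async_crawler import crawl_batch

app = FastAPI(title="Crawler API (sketch)")

class CrawlRequest(BaseModel):
    urls: list[str]
    max_concurrent: int = 5

@app.get("/health")
async def health():
    # status.sh probes this endpoint on port 8106
    return {"status": "ok"}

@app.post("/crawl")
async def crawl(req: CrawlRequest):
    raw = await crawl_batch(req.urls, max_concurrent=req.max_concurrent)
    results = [r for r in raw if isinstance(r, dict)]  # drop raised exceptions
    return {"count": len(results), "results": results}

if __name__ == "__main__":
    import uvicorn
    # start_dev.sh runs the real file directly; gunicorn_config.py binds the same port
    uvicorn.run(app, host="0.0.0.0", port=8106)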
#!/bin/bash
# 重启爬虫 API 服务
echo "=========================================="
echo "重启爬虫 API 服务"
echo "=========================================="
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# 停止服务
bash "$CURRENT_DIR/stop.sh"
# 等待 2 秒
sleep 2
# 启动服务
bash "$CURRENT_DIR/start_background.sh"
#!/bin/bash
# 后台运行爬虫 API 服务
echo "=========================================="
echo "后台启动爬虫 API 服务"
echo "=========================================="
# 激活虚拟环境(如果有)
if [ -d "venv" ]; then
source venv/bin/activate
fi
# 获取当前目录
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_DIR="$CURRENT_DIR/logs"
PID_FILE="$CURRENT_DIR/crawler_api.pid"
# 创建日志目录
mkdir -p "$LOG_DIR"
# 检查是否已经在运行
if [ -f "$PID_FILE" ]; then
OLD_PID=$(cat "$PID_FILE")
if ps -p "$OLD_PID" > /dev/null 2>&1; then
echo "⚠️ 服务已在运行 (PID: $OLD_PID)"
echo "如需重启,请先执行: ./stop.sh"
exit 1
else
echo "清理过期的 PID 文件..."
rm -f "$PID_FILE"
fi
fi
# Worker 数量
WORKERS=${WORKERS:-2}
echo "配置信息:"
echo " Workers: $WORKERS"
echo " 日志目录: $LOG_DIR"
echo " PID 文件: $PID_FILE"
echo ""
# 使用 nohup 后台运行
nohup gunicorn api_server_optimized:app \
-c gunicorn_config.py \
--workers $WORKERS \
--worker-class uvicorn.workers.UvicornWorker \
--bind 0.0.0.0:8106 \
--timeout 300 \
--pid "$PID_FILE" \
--access-logfile "$LOG_DIR/access.log" \
--error-logfile "$LOG_DIR/error.log" \
--log-level info \
> "$LOG_DIR/gunicorn.log" 2>&1 &
# 等待启动
sleep 2
# 检查是否启动成功
if [ -f "$PID_FILE" ]; then
PID=$(cat "$PID_FILE")
if ps -p "$PID" > /dev/null 2>&1; then
echo "✅ 服务已成功启动!"
echo " PID: $PID"
echo " API 地址: http://localhost:8106"
echo " API 文档: http://localhost:8106/docs"
echo ""
echo "查看日志:"
echo " 访问日志: tail -f $LOG_DIR/access.log"
echo " 错误日志: tail -f $LOG_DIR/error.log"
echo " 主日志: tail -f $LOG_DIR/gunicorn.log"
echo ""
echo "停止服务: ./stop.sh"
else
echo "❌ 服务启动失败,请查看日志:"
echo " cat $LOG_DIR/gunicorn.log"
exit 1
fi
else
echo "❌ 启动失败,未生成 PID 文件"
exit 1
fi
#!/bin/bash
# 开发环境启动脚本(使用 Uvicorn,支持热重载)
echo "=========================================="
echo "启动爬虫 API 服务(开发模式)"
echo "=========================================="
# 激活虚拟环境(如果有)
if [ -d "venv" ]; then
echo "激活虚拟环境..."
source venv/bin/activate
fi
echo "启动 Uvicorn 开发服务器(支持热重载)..."
echo "API 地址: http://localhost:8106"
echo "API 文档: http://localhost:8106/docs"
echo ""
# 启动 Uvicorn(开发模式,支持热重载)
exec python3 api_server_optimized.py
# 或者使用 uvicorn 命令行(支持更多选项)
# exec uvicorn api_server_optimized:app \
# --host 0.0.0.0 \
# --port 8106 \
# --reload \
# --log-level info
#!/bin/bash
# 生产环境启动脚本(使用 Gunicorn)
echo "=========================================="
echo "启动爬虫 API 服务(生产模式)"
echo "=========================================="
# 激活虚拟环境(如果有)
if [ -d "venv" ]; then
echo "激活虚拟环境..."
source venv/bin/activate
fi
# 检查 gunicorn 是否安装
if ! command -v gunicorn &> /dev/null; then
echo "❌ Gunicorn 未安装"
echo "安装命令: pip install gunicorn"
exit 1
fi
# 获取 CPU 核心数
CPU_CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 2)
WORKERS=${WORKERS:-2} # 默认 2 个 worker
echo "系统 CPU 核心数: $CPU_CORES"
echo "启动 Worker 数量: $WORKERS"
echo ""
# 启动 Gunicorn
exec gunicorn api_server_optimized:app \
-c gunicorn_config.py \
--workers $WORKERS \
--worker-class uvicorn.workers.UvicornWorker \
--bind 0.0.0.0:8106 \
--timeout 300 \
--access-logfile - \
--error-logfile - \
--log-level info
#!/bin/bash
# 查看爬虫 API 服务状态
echo "=========================================="
echo "爬虫 API 服务状态"
echo "=========================================="
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$CURRENT_DIR/crawler_api.pid"
# 检查 PID 文件
if [ -f "$PID_FILE" ]; then
PID=$(cat "$PID_FILE")
echo "PID 文件: $PID_FILE"
echo "主进程 PID: $PID"
echo ""
# 检查进程是否存在
if ps -p "$PID" > /dev/null 2>&1; then
echo "✅ 服务运行中"
echo ""
echo "进程信息:"
ps aux | head -1
ps aux | grep -E "($PID|gunicorn.*api_server_optimized)" | grep -v grep
echo ""
# 检查端口
echo "端口监听:"
netstat -tlnp 2>/dev/null | grep 8106 || ss -tlnp 2>/dev/null | grep 8106 || echo " 无法获取端口信息"
echo ""
# 测试 API 连接
echo "API 健康检查:"
if command -v curl &> /dev/null; then
RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8106/health 2>/dev/null)
if [ "$RESPONSE" = "200" ]; then
echo " ✅ API 响应正常 (HTTP $RESPONSE)"
curl -s http://localhost:8106/health | python3 -m json.tool 2>/dev/null || echo ""
else
echo " ⚠️ API 未响应或异常 (HTTP $RESPONSE)"
fi
else
echo " (curl 未安装,跳过检查)"
fi
else
echo "❌ 服务未运行(PID 文件存在但进程不存在)"
echo " 建议清理: rm -f $PID_FILE"
fi
else
echo "⚠️ PID 文件不存在"
# 尝试查找相关进程
PIDS=$(pgrep -f "gunicorn.*api_server_optimized")
if [ -n "$PIDS" ]; then
echo ""
echo "发现相关进程(可能未通过脚本启动):"
ps aux | head -1
ps aux | grep -E "gunicorn.*api_server_optimized" | grep -v grep
else
echo "❌ 服务未运行"
fi
fi
echo ""
echo "=========================================="
echo "快捷命令:"
echo " 启动: ./start_background.sh"
echo " 停止: ./stop.sh"
echo " 重启: ./restart.sh"
echo " 查看日志: tail -f logs/gunicorn.log"
echo "=========================================="
#!/bin/bash
# 停止后台运行的爬虫 API 服务
echo "=========================================="
echo "停止爬虫 API 服务"
echo "=========================================="
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$CURRENT_DIR/crawler_api.pid"
# 检查 PID 文件是否存在
if [ ! -f "$PID_FILE" ]; then
echo "⚠️ PID 文件不存在,服务可能未运行"
# 尝试查找进程
PIDS=$(pgrep -f "gunicorn.*api_server_optimized")
if [ -n "$PIDS" ]; then
echo "发现相关进程:"
ps aux | grep -E "gunicorn.*api_server_optimized" | grep -v grep
echo ""
read -p "是否停止这些进程?(y/n) " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
echo "$PIDS" | xargs kill -TERM
echo "✅ 已发送停止信号"
fi
else
echo "未发现运行中的服务"
fi
exit 0
fi
# 读取 PID
PID=$(cat "$PID_FILE")
# 检查进程是否存在
if ! ps -p "$PID" > /dev/null 2>&1; then
echo "⚠️ 进程 $PID 不存在,清理 PID 文件..."
rm -f "$PID_FILE"
exit 0
fi
echo "正在停止服务 (PID: $PID)..."
# 发送 TERM 信号(优雅关闭)
kill -TERM "$PID"
# 等待进程结束
TIMEOUT=30
for i in $(seq 1 $TIMEOUT); do
if ! ps -p "$PID" > /dev/null 2>&1; then
echo "✅ 服务已停止"
rm -f "$PID_FILE"
exit 0
fi
echo -n "."
sleep 1
done
echo ""
echo "⚠️ 进程未在 $TIMEOUT 秒内停止,发送 KILL 信号..."
kill -KILL "$PID"
sleep 1
if ! ps -p "$PID" > /dev/null 2>&1; then
echo "✅ 服务已强制停止"
rm -f "$PID_FILE"
else
echo "❌ 无法停止进程 $PID"
exit 1
fi