From 15f51dd30304510c94b10e9f61e9055fd415e4d0 Mon Sep 17 00:00:00 2001 From: yuany3721 Date: Wed, 22 Apr 2026 10:10:17 +0800 Subject: [PATCH] docs: add nginx SNI + zerotier architecture post and nchan SSL renewal issue --- examples/blog/docs/blog/2026-04-21.md | 285 +++++++++++++++++++++++++ examples/blog/docs/notes/2026-04-22.md | 135 ++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 examples/blog/docs/blog/2026-04-21.md create mode 100644 examples/blog/docs/notes/2026-04-22.md diff --git a/examples/blog/docs/blog/2026-04-21.md b/examples/blog/docs/blog/2026-04-21.md new file mode 100644 index 00000000..cbc46c77 --- /dev/null +++ b/examples/blog/docs/blog/2026-04-21.md @@ -0,0 +1,285 @@ +--- +title: 家庭服务外网访问架构重构:Nginx SNI 分流 + ZeroTier 虚拟组网 +createTime: 2026/04/21 20:51:22 +tags: + - nginx + - zerotier +description: 介绍一种基于 Nginx SNI 分流和 ZeroTier 虚拟组网的家庭服务外网访问架构,实现云服务器与内网物理机的协同工作 +--- + +## 背景与动机 + +### 硬件环境 + +- **云服务器**:阿里云 2核2G +- **内网物理机**:Dell R730XD(E5-2650 v4, 32G) + +### 之前的方案:frp 内网穿透 + +早期使用 [frp](https://github.com/fatedier/frp) 的内网穿透方案 将内网服务暴露到外网: + +```text +用户请求 -> 阿里云nginx -> frps(云服务器) -> frpc(内网) -> 目标服务 +``` + +frp 方案的局限性: + +1. 所有流量都要经过 frp 转发,额外增加一层代理开销 +2. 每新增一个服务,需要同时修改云服务器和内网机的配置 +3. 某些协议(如 WebSocket)需要额外配置支持 + +## 架构概述 + +本文介绍 SNI 分流 + ZeroTier 的架构方案核心组件: + +- **云服务器**:拥有公网 IP,作为流量入口,负责 SNI 分流 +- **ZeroTier**:建立云服务器与内网物理机的虚拟局域网隧道。参考[Zerotier组网的简单应用](./2024-07-01.md) +- **内网物理机**:运行实际服务,通过 ZeroTier 接收转发流量 + +核心思路:云服务器只做流量分发,实际服务运行在内网物理机。 + +### 网络拓扑 + +```text + 网络请求 + │ + ▼ + 阿里云 + │ + ┌─────────────┼───────────────┐ + │ │ │ + ▼ ▼ ▼ + frp.a.com zerotier.a.com *.a.com (默认) + │ │ │ + ▼ ▼ ▼ + frp dashboard zerotier ztncui ZeroTier 隧道 + │ + ▼ + ┌─────────────────────────────────────┐ + │ 内网物理机 (R730XD) │ + │ ZeroTier IP: 172.xx.xx.x │ + │ │ + │ ┌─────────┬─────────┬──────────┐ │ + │ │ service1│ service2│ service3 │ │ + │ │ service4│ service5│ ... 
│ │ + │ └─────────┴─────────┴──────────┘ │ + └─────────────────────────────────────┘ +``` + +### 优势 + +1. 内网服务不直接暴露公网,通过 ZeroTier 隧道通信 +2. 无需额外配置,新增/修改服务只需在内网配置,云服务器无需改动 +3. 降低云服务器负载压力,SNI 分流在四层完成,无需解密 HTTPS + +## 云服务器配置 + +云服务器作为流量入口,使用 Nginx 的 stream 模块进行四层代理和 SNI 分流。 + +1. `/etc/nginx/nginx.conf` + +- stream 块:处理 443 端口的 TCP 流量,根据 SNI 域名分流 +- http 块:处理 80 端口的 HTTP 流量,没有SNI,但是可以直接根据域名分流转发到内网 + +```nginx +stream { + # 内网物理机 upstream (通过 ZeroTier) + upstream home_backend { + server 172.xx.x.x:443; + } + + # 本地服务 upstream + upstream local_https { + server 127.0.0.1:8443; + } + + # SNI 路由表 + map $ssl_preread_server_name $backend { + frp.a.com local_https; + zerotier.a.com local_https; + default home_backend; + } + + server { + listen 443; + ssl_preread on; # 启用 SNI 预读取 + proxy_pass $backend; # 根据域名分流 + proxy_protocol on; # 传递真实 IP + } +} +``` + +2. HTTP 转发 + +`/etc/nginx/nginx.conf` (http 块) + +```nginx +server { + listen 80; + # 因为frps和zerotier配置了特定server_name,优先于泛域名匹配,因此不影响本地服务转发 + server_name *.a.com a.com; + client_max_body_size 500M; + + location / { + proxy_pass http://172.xx.x.x:80; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} +``` + +3. 本地 HTTPS 服务配置 + +`/etc/nginx/conf.d/frp.a.com`处理需要在云服务器本地处理的特定域名 + +```nginx +server { + listen 8443 ssl http2 proxy_protocol; + server_name frp.a.com; + + # 接收 proxy_protocol 传递的真实 IP + set_real_ip_from 127.0.0.1; + real_ip_header proxy_protocol; + + location / { + proxy_pass http://127.0.0.1:6001; + } + + ssl_certificate /etc/letsencrypt/live/frp.a.com/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/frp.a.com/privkey.pem; +} + +# HTTP 重定向 +server { + listen 80; + server_name frp.a.com; + return 301 https://$host$request_uri; +} +``` + +## 内网物理机配置 + +内网物理机运行在家庭网络中,通过 ZeroTier 与云服务器组成虚拟局域网。 + +1. 
内网服务配置 + +处理阿里云转发过来的域名请求,根据子域名反向代理到不同服务: + +`/etc/nginx/conf.d/service1.a.com` + +```nginx +server { + listen 443 ssl http2 proxy_protocol; + server_name service1.a.com; + + # 接收 proxy_protocol 传递的真实 IP + set_real_ip_from 127.0.0.1; + real_ip_header proxy_protocol; + + location / { + proxy_http_version 1.1; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_pass http://127.0.0.1:7001; + } + + ssl_certificate /etc/letsencrypt/live/service1.a.com/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/service1.a.com/privkey.pem; +} + +# HTTP 也被转发进来了,重定向 +server { + listen 80; + server_name service1.a.com; + return 301 https://$host$request_uri; +} +``` + +2. 主配置文件 + +`/etc/nginx/nginx.conf` + +内网 Nginx 使用标准的 HTTP 模块配置 + +```nginx +user www-data; +worker_processes auto; +error_log /var/log/nginx/error.log; + +http { + sendfile on; + tcp_nopush on; + types_hash_max_size 2048; + + include /etc/nginx/mime.types; + default_type application/octet-stream; + + ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3; + ssl_prefer_server_ciphers on; + + access_log /var/log/nginx/access.log; + + gzip on; + + # 包含所有站点配置 + include /etc/nginx/conf.d/*.conf; + include /etc/nginx/conf.d/*.a.com; # 自己添加的include +} +``` + +## 证书管理 + +1. 云服务器证书 + +为需要在本地处理的域名申请独立证书: + +```text +frp.a.com +zerotier.a.com +``` + +2. 内网物理机证书 + +直接在内网就可以为泛域名或各个子域名申请证书(二选一即可): + +```text +# 泛域名证书 +*.a.com + +# 独立子域名证书 +service1.a.com +service2.a.com +service3.a.com +# ... +``` + +由于我直接用`certbot --nginx`配置的证书,直接用`certbot renew`即可更新证书,无需手动操作。 + +## 问题 + +1. SSL 协议错误 + +`ERR_SSL_PROTOCOL_ERROR` + +由于 stream 模块配置了 `proxy_protocol on`,所有接收其转发流量的 Nginx `listen ... ssl` 指令都必须加上 `proxy_protocol` 参数,否则 PROXY 协议头会被当作 TLS 数据解析,导致握手失败。 + +云服务器本地的服务和内网物理机上的服务都需要加上。 + +2. 
Nginx nchan 模块导致 SSL 证书续期失败 + +Nginx nchan 模块导致 SSL 证书批量续期失败,可能是 certbot 大量重写配置文件并频繁重载 nginx 时触发的 bug,参考:[Nginx nchan 模块导致 SSL 证书批量续期失败的调试记录](../notes/2026-04-22.md)。 + +## Reference + +- [Nginx Stream](http://nginx.org/en/docs/stream/ngx_stream_core_module.html) +- [Nginx SNI](https://nginx.org/en/docs/stream/ngx_stream_ssl_preread_module.html) +- [ZeroTier 官方文档](https://docs.zerotier.com/) +- [Certbot 文档](https://eff-certbot.readthedocs.io/) +- [Nginx Ticket #1135 - Connections timing out after upgrading to 1.10.2](https://trac.nginx.org/nginx/ticket/1135) diff --git a/examples/blog/docs/notes/2026-04-22.md b/examples/blog/docs/notes/2026-04-22.md new file mode 100644 index 00000000..be4a492f --- /dev/null +++ b/examples/blog/docs/notes/2026-04-22.md @@ -0,0 +1,135 @@ +--- +title: Nginx nchan 模块导致 SSL 证书批量续期失败 +createTime: 2026/04/22 08:43:00 +tags: + - nginx + - ssl + - certbot + - nchan +--- + +## 问题背景 + +在例行检查 SSL 证书自动续期时,发现 `certbot renew --dry-run` 命令出现大量失败。13 个域名中,前 6 个续期成功,后 7 个全部失败,返回 **504 Gateway Timeout** 错误。 + +## 环境信息 + +| 组件 | 版本/配置 | +| -------- | --------- | +| Nginx | 1.24.0 | +| Certbot | 2.9.0 | +| 域名数量 | 13 个 | + +## 错误信息 + +```text +Certbot failed to authenticate some domains (authenticator: nginx). +The Certificate Authority reported these problems: + Domain: example.a.com + Type: unauthorized + Detail: 12.34.56.78: Invalid response from + http://example.a.com/.well-known/acme-challenge/xxx: 504 +``` + +## 初步排查 + +### 1. 检查 Nginx 状态 + +Nginx 服务显示运行正常,但发现异常: + +```bash +$ pgrep nginx | wc -l +147 +``` + +有 **147 个 nginx 进程**,远超正常数量(通常 1 master + N workers)。 + +并且无法正常访问到任何 nginx 代理的服务,疑似进程阻塞。 + +### 2. 
检查错误日志 + +```bash +$ grep "23:18" /var/log/nginx/error.log +``` + +发现大量 worker 进程崩溃记录: + +```text +2026/04/21 23:18:20 [alert] 149662#149662: worker process 153306 exited on signal 6 (core dumped) +2026/04/21 23:18:20 [alert] 149662#149662: shared memory zone "memstore" was locked by 153306 +2026/04/21 23:18:20 [alert] 149662#149662: worker process 153307 exited on signal 6 (core dumped) +2026/04/21 23:18:20 [alert] 149662#149662: shared memory zone "memstore" was locked by 153307 +... +``` + +统计崩溃次数: + +```bash +$ grep -c "exited on signal 6" /var/log/nginx/error.log +2361 +``` + +**2361 次 worker 进程崩溃!** + +## 问题分析 + +```text +certbot renew → 修改 nginx 配置 → nginx reload + → nchan 模块 bug → worker 崩溃 + → 无法处理请求 → Let's Encrypt 等待超过 60 秒 → 504 超时 +``` + +certbot 按顺序处理证书,每个证书需要: + +1. 修改 nginx 配置添加临时验证路径 +2. reload nginx +3. 等待 Let's Encrypt 验证 +4. 恢复配置 + +当处理到第 7 个证书时,频繁的 reload 触发了 nchan 模块的 bug,导致 worker 进程批量崩溃。此时 nginx 无法正常响应请求,后续所有证书验证都超时失败。 + +根据 [Nginx Ticket #1135](https://trac.nginx.org/nginx/ticket/1135) 的记录,nchan 模块在 nginx reload 时存在已知问题: + +> After upgrading from 1.10.1 without ALPN support to 1.10.2 with ALPN support... we've been getting into situations where Nginx completely stops serving connections without any warning. +> +> The nginx error log on the affected hosts gets these odd messages: +> +> ```text +> worker process exited on signal 6 (core dumped) +> shared memory zone "memstore" was locked by xxx +> ``` + +## 解决方案 + +禁用 nchan 模块 + +```bash +# 1. 找到 nchan 模块配置 +ls -la /etc/nginx/modules-enabled/ | grep nchan +# lrwxrwxrwx 1 root root 49 Apr 20 18:50 50-mod-nchan.conf -> /usr/share/nginx/modules-available/mod-nchan.conf + +# 2. 删除符号链接(禁用模块) +sudo rm /etc/nginx/modules-enabled/50-mod-nchan.conf + +# 3. 测试配置 +sudo nginx -t +# nginx: the configuration file /etc/nginx/nginx.conf syntax is ok +# nginx: configuration file /etc/nginx/nginx.conf test is successful + +# 4. 
重启 nginx +sudo service nginx restart +``` + +重新运行 certbot 续期测试: + +```bash +sudo certbot renew --dry-run +``` + +结果:13 个证书全部续期成功。 + +## 参考链接 + +- [Nginx Ticket #1135 - Connections timing out after upgrading to 1.10.2](https://trac.nginx.org/nginx/ticket/1135) +- [nchan 官方文档](https://nchan.io/) +- [Certbot 文档](https://eff-certbot.readthedocs.io/)