PSI进销存系统高可用架构与灾备方案设计
引言
进销存系统是企业运营的核心系统,一旦出现业务中断将直接影响企业日常经营。保证系统的高可用性和灾备能力,是企业级应用必备的要素。本文介绍 PSI 进销存系统的高可用架构与灾备方案设计。
高可用架构设计
整体架构采用多层级高可用设计:
| 层级 | 高可用策略 | 技术实现 |
|---|---|---|
| 接入层 | 负载均衡、多节点 | Nginx/HAProxy + Keepalived |
| 应用层 | 无状态服务、自动伸缩 | Docker + K8s |
| 数据层 | 主从复制、多副本、读写分离 | MySQL(GTID + 半同步复制)+ MySQL Router |
| 缓存层 | 多节点、持久化 | Redis Sentinel/Cluster |
多活架构设计
同城双活架构实现业务无中断:
# Kubernetes 部署配置
---
# Application Deployment: runs the stateless PSI application Pods.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: psi-app
  namespace: psi-prod
spec:
  # NOTE(review): a HorizontalPodAutoscaler in this file also targets this
  # Deployment; re-applying the manifest resets the replica count to this value.
  replicas: 3
  selector:
    matchLabels:
      app: psi-app
  template:
    metadata:
      labels:
        app: psi-app
        version: v2.5
    spec:
      affinity:
        # Anti-affinity: prefer spreading application Pods across different nodes.
        # This is a soft preference (preferred...), so co-location is still possible.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - psi-app
                topologyKey: kubernetes.io/hostname
      containers:
        - name: psi-app
          image: psi/psi-app:v2.5.0
          ports:
            - containerPort: 8080
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          env:
            # Database and cache endpoints come from the psi-config ConfigMap.
            - name: DB_HOST
              valueFrom:
                configMapKeyRef:
                  name: psi-config
                  key: db_host
            - name: REDIS_HOST
              valueFrom:
                configMapKeyRef:
                  name: psi-config
                  key: redis_host
          # Readiness gates traffic; liveness restarts a stuck container.
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /health/live
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
---
# Service: cluster-internal entry point that forwards port 80 to the
# application containers' port 8080.
apiVersion: v1
kind: Service
metadata:
  name: psi-app
  namespace: psi-prod
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 8080
  selector:
    app: psi-app
---
# HPA: scales the psi-app Deployment between 2 and 10 replicas based on
# average CPU (70%) and memory (80%) utilization relative to container requests.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: psi-app-hpa
  namespace: psi-prod
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: psi-app
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
数据库高可用配置
MySQL 主从复制与自动故障切换:
# MySQL primary/replica configuration example
---
# Primary configuration file: my.cnf
[mysqld]
server-id = 1
log-bin = mysql-bin
binlog_format = ROW
binlog_row_image = FULL
# NOTE(review): expire_logs_days is deprecated in MySQL 8.0 in favour of
# binlog_expire_logs_seconds - confirm the target server version.
expire_logs_days = 7
max_connections = 500

# Enable GTID mode
gtid_mode = ON
enforce_gtid_consistency = ON

# Semi-synchronous replication.
# The rpl_semi_sync_* variables only exist once the plugins are loaded, so the
# plugins must be loaded before the variables are set in this file.
plugin-load-add = semisync_master.so
plugin-load-add = semisync_slave.so
rpl_semi_sync_master_enabled = 1
rpl_semi_sync_slave_enabled = 1

# Read/write splitting: the primary accepts writes
read_only = 0
super_read_only = 0
---
# Replica configuration file: my.cnf
[mysqld]
server-id = 2
log-bin = mysql-bin
binlog_format = ROW
relay_log = relay-bin
read_only = 1
super_read_only = 1

# GTID mode
gtid_mode = ON
enforce_gtid_consistency = ON

# Semi-synchronous replication (replica side; plugin must be loaded first)
plugin-load-add = semisync_slave.so
rpl_semi_sync_slave_enabled = 1

# Delayed replication (for data recovery); run on the replica:
# stop slave;
# CHANGE MASTER TO MASTER_DELAY = 3600;
# start slave;
---
# Read/write splitting with MySQL Router (static routing)
# mysqlrouter.ini
# Each section name selects a Router plugin, so only real plugin sections
# ([logger], [routing:<name>]) and their documented options are used here.
[logger]
level = INFO

# Read traffic: round-robin across the listed servers on port 7001
[routing:psi_read]
bind_address = 0.0.0.0
bind_port = 7001
destinations = mysql-0.mysql:3306,mysql-1.mysql:3306
routing_strategy = round-robin
protocol = classic
connect_timeout = 3

# Write traffic: always the primary on port 7002
[routing:psi_write]
bind_address = 0.0.0.0
bind_port = 7002
destinations = mysql-primary.mysql:3306
routing_strategy = first-available
protocol = classic
connect_timeout = 3
数据同步与复制策略
/**
 * Cross-datacenter data synchronization service.
 *
 * Publishes binlog change events to a message queue, mirrors selected tables
 * into Redis, and tracks which remote regions still need to be synchronized
 * in the Redis set `pending_sync_regions`.
 *
 * The helpers `createBinlogSyncTask`, `fullSync`, `isReadOnlyTable`,
 * `syncToRedis`, `syncToRegion`, `addToRetryQueue` and `triggerAlert` are
 * called on `this` but are not defined in this class; they are presumably
 * supplied elsewhere (subclass or mixin) - confirm before use.
 */
class CrossRegionSyncService {
  /**
   * @param {object} mysqlClient - Local MySQL client exposing `query(sql)`.
   * @param {object} redisClient - Redis client exposing `sadd`, `srem`, `smembers`.
   * @param {object} mqClient - Message-queue client exposing `publish(queue, message)`.
   * @param {object} [options]
   * @param {string} [options.currentRegion] - Identifier of the region this instance runs in.
   * @param {object} [options.remoteMysql] - MySQL client for the remote region (consistency checks).
   * @param {string[]} [options.regions] - Regions that may need synchronization.
   */
  constructor(mysqlClient, redisClient, mqClient, options = {}) {
    const { currentRegion, remoteMysql, regions = ['region-b', 'region-c'] } = options;

    this.mysql = mysqlClient;
    this.redis = redisClient;
    this.mq = mqClient;
    this.regions = regions;

    // Only assign when provided so an externally supplied value/accessor is not clobbered.
    if (currentRegion !== undefined) {
      this.currentRegion = currentRegion;
    }
    if (remoteMysql !== undefined) {
      this.remoteMysql = remoteMysql;
    }
  }

  /**
   * Set up the replication channel from the local to the remote region.
   * @param {string} localRegion
   * @param {string} remoteRegion
   */
  async initSyncChannel(localRegion, remoteRegion) {
    const source = `${localRegion}-mysql`;
    const target = `${remoteRegion}-mysql`;

    // Create the incremental (binlog) sync task
    await this.createBinlogSyncTask({
      source,
      target,
      mode: 'async', // asynchronous replication
      conflictStrategy: 'last_write_win', // on conflict the last write wins
    });

    // Full sync for first-time initialization
    await this.fullSync({
      source,
      target,
      tables: ['product', 'customer', 'supplier', 'warehouse', 'order', 'stock'],
    });
  }

  /**
   * Handle one data-change event (delivered by a binlog listener such as Canal).
   * @param {{table: string, type: string, data: object, oldData: object}} event
   */
  async handleBinlogChange(event) {
    const { table, type, data, oldData } = event;

    // Skip tables that are flagged read-only
    if (this.isReadOnlyTable(table)) return;

    const syncMessage = {
      table,
      operation: type, // insert/update/delete
      data,
      oldData,
      timestamp: Date.now(),
      source: this.currentRegion,
    };

    // 1. Write to the local message queue first
    await this.mq.publish('sync.queue', syncMessage);

    // 2. Mirror to Redis (used for real-time front-end queries)
    if (['product', 'customer'].includes(table)) {
      await this.syncToRedis(table, data);
    }

    // 3. Mark every other region as needing synchronization.
    //    SADD with zero members is a Redis error, so skip when nothing is left.
    const targetRegions = this.regions.filter((region) => region !== this.currentRegion);
    if (targetRegions.length > 0) {
      await this.redis.sadd('pending_sync_regions', targetRegions);
    }
  }

  /**
   * Synchronize to every region currently marked as pending.
   * Failures are logged and handed to the retry queue; they do not abort the loop.
   */
  async syncToRemoteRegions() {
    const pendingRegions = await this.redis.smembers('pending_sync_regions');

    for (const region of pendingRegions) {
      // Claim the marker before syncing so that a change arriving mid-sync
      // re-marks the region instead of being wiped by a later removal.
      await this.redis.srem('pending_sync_regions', region);
      try {
        await this.syncToRegion(region);
      } catch (error) {
        console.error(`Sync to ${region} failed:`, error);
        // Restore the marker and enqueue a retry
        await this.redis.sadd('pending_sync_regions', [region]);
        await this.addToRetryQueue(region);
      }
    }
  }

  /**
   * Compare row counts between the local and remote databases and raise a
   * `data_inconsistency` alert for every table whose counts differ.
   * Assumes `query` resolves to an array of row objects - confirm against the client in use.
   * @throws {Error} When no remote MySQL client has been configured.
   */
  async verifyDataConsistency() {
    if (!this.remoteMysql) {
      throw new Error('verifyDataConsistency requires a remoteMysql client');
    }

    const tables = ['product', 'customer', 'order'];
    for (const table of tables) {
      // Backtick-quote the identifier: `order` is a reserved word in MySQL.
      const sql = `SELECT COUNT(*) as cnt FROM \`${table}\``;
      const [localCount, remoteCount] = await Promise.all([
        this.mysql.query(sql),
        this.remoteMysql.query(sql),
      ]);

      if (localCount[0].cnt !== remoteCount[0].cnt) {
        await this.triggerAlert('data_inconsistency', {
          table,
          local: localCount[0].cnt,
          remote: remoteCount[0].cnt,
        });
      }
    }
  }
}
故障自动切换机制
/**
 * Automatic failure detection and failover.
 *
 * Periodically probes every node, removes a node from the load balancer after
 * `maxFailureCount` consecutive failed probes, and puts it back after
 * `recoveryThreshold` consecutive successful probes.
 *
 * The helpers `getAllNodes`, `updateNodeHealth`, `markNodeUnavailable`,
 * `markNodeAvailable`, `getBackupNodes`, `switchDatabaseConnection`,
 * `switchCacheConnection` and `sendAlert` are called on `this` but are not
 * defined in this class; they are presumably supplied elsewhere - confirm.
 */
class FailoverService {
  /**
   * @param {object} healthCheck - Exposes `checkNode(node)` resolving to a boolean.
   * @param {object} loadBalancer - Exposes `addNode(node)` and `removeNode(node)`.
   * @param {object} dnsManager - Exposes `update(hostname, ip)`.
   */
  constructor(healthCheck, loadBalancer, dnsManager) {
    this.healthCheck = healthCheck;
    this.loadBalancer = loadBalancer;
    this.dnsManager = dnsManager;
    this.failoverConfig = {
      maxFailureCount: 3, // fail over after 3 consecutive failures
      checkInterval: 5000, // probe every 5 seconds
      failoverTimeout: 30000, // failover should complete within 30 seconds
      recoveryThreshold: 3, // switch back only after 3 consecutive successes
    };
    // Per-node counters keyed by node id, so state survives even when
    // getAllNodes() returns fresh node objects on every round.
    this.nodeStates = new Map();
    this.healthCheckTimer = null;
    this.isChecking = false;
  }

  /** Return (creating on first use) the tracked state for a node. */
  getNodeState(node) {
    let state = this.nodeStates.get(node.id);
    if (!state) {
      state = {
        // Seed from counters already present on the node object, if any.
        failureCount: Number.isInteger(node.failureCount) ? node.failureCount : 0,
        successCount: Number.isInteger(node.successCount) ? node.successCount : 0,
        failedOver: false,
      };
      this.nodeStates.set(node.id, state);
    }
    return state;
  }

  /** Mirror the tracked counters onto the node object for callers that read them. */
  publishCounters(node, state) {
    node.failureCount = state.failureCount;
    node.successCount = state.successCount;
  }

  /**
   * Start periodic health checking. Calling it again while running is a no-op.
   */
  async startHealthCheck() {
    if (this.healthCheckTimer) return;

    this.healthCheckTimer = setInterval(async () => {
      // Skip this tick if the previous round is still running.
      if (this.isChecking) return;
      this.isChecking = true;
      try {
        await this.checkAllNodes();
      } catch (error) {
        // A rejected timer callback would otherwise be an unhandled rejection.
        console.error('Health check round failed:', error);
      } finally {
        this.isChecking = false;
      }
    }, this.failoverConfig.checkInterval);
  }

  /** Stop periodic health checking started by startHealthCheck(). */
  stopHealthCheck() {
    if (this.healthCheckTimer) {
      clearInterval(this.healthCheckTimer);
      this.healthCheckTimer = null;
    }
  }

  /**
   * Probe every node once and dispatch to the failure/recovery handlers.
   */
  async checkAllNodes() {
    const nodes = await this.getAllNodes();

    for (const node of nodes) {
      let isHealthy;
      try {
        isHealthy = await this.healthCheck.checkNode(node);
      } catch (error) {
        // A probe that throws (timeout, connection refused) counts as a failed probe.
        console.error(`Health probe for node ${node.id} threw:`, error);
        isHealthy = false;
      }

      // Record the latest health status
      await this.updateNodeHealth(node.id, isHealthy);

      if (!isHealthy) {
        await this.handleNodeFailure(node);
      } else {
        await this.handleNodeRecovery(node);
      }
    }
  }

  /**
   * Record a failed probe and fail over once the consecutive-failure
   * threshold is reached. The failover runs once per outage, not on every
   * subsequent failed probe.
   * @param {object} node
   */
  async handleNodeFailure(node) {
    const state = this.getNodeState(node);
    state.failureCount += 1;
    state.successCount = 0; // a failure breaks any streak of successes
    this.publishCounters(node, state);

    console.log(`Node ${node.id} health check failed, count: ${node.failureCount}`);

    if (state.failedOver || state.failureCount < this.failoverConfig.maxFailureCount) {
      return;
    }

    console.log(`Node ${node.id} marked as unhealthy, starting failover...`);
    // 1. Mark the node unavailable
    await this.markNodeUnavailable(node);
    // 2. Tell the load balancer
    await this.loadBalancer.removeNode(node);
    // 3. Move traffic to a backup node
    await this.switchToBackup(node);
    // Flag only after the switch succeeded so a failed attempt is retried next round.
    state.failedOver = true;
    // 4. Send an alert
    await this.sendAlert('node_failover', {
      node: node.id,
      timestamp: new Date(),
      originalRegion: node.region,
    });
  }

  /**
   * Move database/cache connections (and DNS for cross-region switches)
   * from the failed node to the first available backup node.
   * @param {object} failedNode
   */
  async switchToBackup(failedNode) {
    const backupNodes = await this.getBackupNodes(failedNode);
    if (backupNodes.length === 0) {
      console.error('No backup nodes available!');
      return;
    }

    const [backupNode] = backupNodes;
    // Switch database connections
    await this.switchDatabaseConnection(failedNode, backupNode);
    // Switch cache connections
    await this.switchCacheConnection(failedNode, backupNode);
    // Update DNS only when the backup lives in a different region
    if (failedNode.region !== backupNode.region) {
      await this.dnsManager.update(failedNode.hostname, backupNode.ip);
    }
    console.log(`Traffic switched from ${failedNode.id} to ${backupNode.id}`);
  }

  /**
   * Record a successful probe. A node that was failed over is returned to the
   * pool after `recoveryThreshold` consecutive successes; a node that was never
   * removed simply has its failure streak cleared.
   * @param {object} node
   */
  async handleNodeRecovery(node) {
    const state = this.getNodeState(node);

    if (!state.failedOver) {
      // Never removed from the pool: a success just ends the failure streak.
      state.failureCount = 0;
      state.successCount = 0;
      this.publishCounters(node, state);
      return;
    }

    state.successCount += 1;
    this.publishCounters(node, state);

    if (state.successCount < this.failoverConfig.recoveryThreshold) {
      return;
    }

    console.log(`Node ${node.id} recovered, adding back to pool...`);
    await this.loadBalancer.addNode(node);
    await this.markNodeAvailable(node);
    state.failureCount = 0;
    state.successCount = 0;
    state.failedOver = false;
    this.publishCounters(node, state);
  }
}
灾备演练计划
| 演练类型 | 频率 | 演练内容 | 预期目标 |
|---|---|---|---|
| 单机故障 | 每周 | 模拟单个应用节点故障 | 60秒内自动恢复 |
| 数据库故障 | 每月 | 模拟主库故障,切换到从库 | 5分钟恢复,数据不丢失 |
| 机房故障 | 每季度 | 模拟整个机房不可用 | 15分钟切换到备用机房 |
| 数据恢复 | 每月 | 从备份恢复数据到测试环境 | 数据完整,可查询 |
总结
PSI 进销存系统高可用架构的核心要点:
- 多层级高可用:接入层、应用层、数据层、缓存层均采用高可用设计
- 多活架构:同城双活,支持流量自动切换
- 数据同步:通过 binlog 实现跨机房数据同步
- 自动故障切换:健康检查 + 自动 failover
- 定期演练:制定灾备演练计划,验证系统可靠性
通过完善的高可用架构设计,可以确保 PSI 进销存系统 7×24 小时稳定运行,为企业业务连续性提供可靠保障。