个人编制软件展示

PSI - Purchase Sale Inventory 进销存软件

PSI进销存系统高可用架构与灾备方案设计

引言

进销存系统是企业运营的核心系统,一旦出现业务中断将直接影响企业日常经营。保证系统的高可用性和灾备能力,是企业级应用必备的要素。本文介绍 PSI 进销存系统的高可用架构与灾备方案设计。

高可用架构设计

整体架构采用多层级高可用设计:

| 层级 | 高可用策略 | 技术实现 |
| --- | --- | --- |
| 接入层 | 负载均衡、多节点 | Nginx/HAProxy + Keepalived |
| 应用层 | 无状态服务、自动伸缩 | Docker + K8s |
| 数据层 | 主从复制、多副本 | MySQL + Redis Cluster |
| 缓存层 | 多节点、持久化 | Redis Sentinel/Cluster |

多活架构设计

同城双活架构实现业务无中断:

# Kubernetes 部署配置
---
# Application Deployment for the PSI service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: psi-app
  namespace: psi-prod
spec:
  # `replicas` is intentionally omitted: the psi-app-hpa HorizontalPodAutoscaler
  # owns the replica count. Keeping a fixed value here would reset the
  # autoscaled count on every `kubectl apply` of this manifest.
  selector:
    matchLabels:
      app: psi-app
  template:
    metadata:
      labels:
        app: psi-app
        version: v2.5
    spec:
      affinity:
        # Anti-affinity: spread the application Pods so that a single failure
        # domain does not take out every replica. Both terms are soft
        # preferences, so scheduling still succeeds on a small cluster.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          # Prefer different zones (dual-active across datacenters).
          # NOTE(review): assumes each datacenter is modelled as a
          # topology.kubernetes.io/zone value on the nodes; confirm.
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - psi-app
              topologyKey: topology.kubernetes.io/zone
          # Prefer different nodes within a zone.
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - psi-app
              topologyKey: kubernetes.io/hostname
      containers:
      - name: psi-app
        image: psi/psi-app:v2.5.0
        ports:
        - containerPort: 8080
        # Requests are required for the HPA utilization targets to be computed.
        resources:
          requests:
            memory: "512Mi"
            cpu: "500m"
          limits:
            memory: "1Gi"
            cpu: "1000m"
        # Connection endpoints come from the psi-config ConfigMap.
        env:
        - name: DB_HOST
          valueFrom:
            configMapKeyRef:
              name: psi-config
              key: db_host
        - name: REDIS_HOST
          valueFrom:
            configMapKeyRef:
              name: psi-config
              key: redis_host
        # Readiness gates Service traffic; probed every 5s after a 10s delay.
        readinessProbe:
          httpGet:
            path: /health/ready
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 5
        # Liveness restarts a hung container; probed every 10s after a 30s delay.
        livenessProbe:
          httpGet:
            path: /health/live
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 10

---
# Service: stable in-cluster endpoint in front of the psi-app Pods.
apiVersion: v1
kind: Service
metadata: { name: psi-app, namespace: psi-prod }
spec:
  type: ClusterIP
  # Routes to every Pod carrying the label app=psi-app.
  selector: { app: psi-app }
  ports:
    - port: 80          # port exposed by the Service
      targetPort: 8080  # port the traffic is forwarded to on the Pod

---
# HorizontalPodAutoscaler: scales the psi-app Deployment between 2 and 10 Pods.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata: { name: psi-app-hpa, namespace: psi-prod }
spec:
  minReplicas: 2
  maxReplicas: 10
  scaleTargetRef: { apiVersion: apps/v1, kind: Deployment, name: psi-app }
  metrics:
    # Target average CPU utilization of 70% of the requested CPU.
    - type: Resource
      resource:
        name: cpu
        target: { type: Utilization, averageUtilization: 70 }
    # Target average memory utilization of 80% of the requested memory.
    - type: Resource
      resource:
        name: memory
        target: { type: Utilization, averageUtilization: 80 }

数据库高可用配置

MySQL 主从复制与自动故障切换:

# MySQL 主从配置示例
---
# Primary (source) server configuration file: my.cnf
[mysqld]
server-id = 1
log-bin = mysql-bin
binlog_format = ROW
binlog_row_image = FULL
# NOTE(review): expire_logs_days is deprecated as of MySQL 8.0, where the
# equivalent setting is binlog_expire_logs_seconds = 604800. Confirm the
# target server version before rollout.
expire_logs_days = 7
max_connections = 500

# Enable GTID mode
gtid_mode = ON
enforce_gtid_consistency = ON

# Semi-synchronous replication.
# The rpl_semi_sync_* variables only exist once the plugins are loaded;
# without the plugin-load-add lines mysqld rejects them as unknown variables
# at startup. Both roles are loaded because both are enabled below.
plugin-load-add = semisync_master.so
plugin-load-add = semisync_slave.so
rpl_semi_sync_master_enabled = 1
rpl_semi_sync_slave_enabled = 1

# The primary accepts writes (read-only mode disabled)
read_only = 0
super_read_only = 0

---
# Replica server configuration file: my.cnf
[mysqld]
server-id = 2
log-bin = mysql-bin
binlog_format = ROW
relay_log = relay-bin
# Reject writes from every account, including SUPER users.
read_only = 1
super_read_only = 1

# GTID mode
gtid_mode = ON
enforce_gtid_consistency = ON

# Semi-synchronous replication.
# The variable only exists once the plugin is loaded; without the
# plugin-load-add line mysqld rejects it as an unknown variable at startup.
plugin-load-add = semisync_slave.so
rpl_semi_sync_slave_enabled = 1

# Delayed replication (for data recovery).
# These are SQL statements to run on the replica, not option-file settings:
# stop slave;
# CHANGE MASTER TO MASTER_DELAY = 3600;
# start slave;
# NOTE(review): a replica delayed by one hour is presumably a dedicated
# recovery replica, not the failover target; confirm before enabling.

---
# Read/write splitting with MySQL Router
# mysqlrouter.ini
#
# Static destinations are used below, so no metadata-cache section is needed.
# Metadata-cache based routing requires an InnoDB Cluster / ReplicaSet and is
# normally generated by `mysqlrouter --bootstrap` rather than written by hand.
[DEFAULT]
# Seconds to wait when connecting to a destination server.
connect_timeout = 3

[logger]
level = INFO

# Read traffic: round-robin across the listed servers.
[routing:psi_read]
bind_address = 0.0.0.0
bind_port = 7001
destinations = mysql-0.mysql:3306,mysql-1.mysql:3306
routing_strategy = round-robin
protocol = classic

# Write traffic: always the first available destination.
# NOTE(review): only one address is listed, so a primary failover relies on
# mysql-primary.mysql being repointed to the new primary; confirm.
[routing:psi_write]
bind_address = 0.0.0.0
bind_port = 7002
destinations = mysql-primary.mysql:3306
routing_strategy = first-available
protocol = classic

数据同步与复制策略

/**
 * Cross-region (cross-datacenter) data synchronization service.
 *
 * Several helpers called below (`createBinlogSyncTask`, `fullSync`,
 * `isReadOnlyTable`, `syncToRedis`, `syncToRegion`, `addToRetryQueue`,
 * `triggerAlert`) are not defined in this class; they are expected to be
 * provided elsewhere (for example by a subclass).
 */
class CrossRegionSyncService {
  /**
   * @param {object} mysqlClient - Local MySQL client exposing `query(sql)`.
   * @param {object} redisClient - Redis client exposing `sadd`, `srem` and `smembers`.
   * @param {object} mqClient - Message-queue client exposing `publish(topic, message)`.
   * @param {object} [options] - Optional settings.
   * @param {string} [options.currentRegion] - Name of the region this instance runs in.
   * @param {object} [options.remoteMysqlClient] - MySQL client of the remote region,
   *   used by `verifyDataConsistency`.
   * @param {string[]} [options.regions] - Regions that take part in the sync;
   *   defaults to `['region-b', 'region-c']`.
   */
  constructor(mysqlClient, redisClient, mqClient, options = {}) {
    this.mysql = mysqlClient;
    this.redis = redisClient;
    this.mq = mqClient;

    // Only assign when provided so that values supplied another way
    // (subclass, prototype, later assignment) are not shadowed by undefined.
    if (options.currentRegion !== undefined) {
      this.currentRegion = options.currentRegion;
    }
    if (options.remoteMysqlClient !== undefined) {
      this.remoteMysql = options.remoteMysqlClient;
    }
    this.syncRegions = Array.isArray(options.regions)
      ? [...options.regions]
      : ['region-b', 'region-c'];
  }

  /**
   * Quotes a MySQL identifier with backticks, doubling embedded backticks.
   * Needed because table names such as `order` are reserved words.
   *
   * @param {string} name - Raw identifier.
   * @returns {string} The quoted identifier.
   */
  static quoteIdentifier(name) {
    return `\`${String(name).replace(/`/g, '``')}\``;
  }

  /**
   * Reads the `cnt` column of the first row of a COUNT query result.
   * Accepts both a plain row array and a `[rows, fields]` tuple, and
   * normalizes the value to a number so that string/number/BigInt results
   * from different drivers compare equal.
   *
   * @param {Array} result - Value resolved by `query()`.
   * @returns {number} The row count.
   */
  static extractCount(result) {
    const rows = Array.isArray(result) && Array.isArray(result[0]) ? result[0] : result;
    return Number(rows[0].cnt);
  }

  /**
   * Initializes the sync channel between two regions: first creates the
   * incremental (binlog) sync task, then runs the initial full sync.
   *
   * @param {string} localRegion - Source region name.
   * @param {string} remoteRegion - Target region name.
   * @returns {Promise<void>}
   */
  async initSyncChannel(localRegion, remoteRegion) {
    const source = `${localRegion}-mysql`;
    const target = `${remoteRegion}-mysql`;

    // Incremental sync task
    await this.createBinlogSyncTask({
      source,
      target,
      mode: 'async', // asynchronous replication
      conflictStrategy: 'last_write_win', // on conflict the last write wins
    });

    // Full sync (first-time initialization)
    await this.fullSync({
      source,
      target,
      tables: ['product', 'customer', 'supplier', 'warehouse', 'order', 'stock'],
    });
  }

  /**
   * Handles one data-change event (per the original note, events come from
   * Canal listening to the binlog).
   *
   * @param {object} event - Change event.
   * @param {string} event.table - Table the change belongs to.
   * @param {string} event.type - Operation: insert/update/delete.
   * @param {object} event.data - Row data after the change.
   * @param {object} [event.oldData] - Row data before the change.
   * @returns {Promise<void>}
   */
  async handleBinlogChange(event) {
    const { table, type, data, oldData } = event;

    // Tables reported as read-only are not synchronized.
    if (this.isReadOnlyTable(table)) {
      return;
    }

    const syncMessage = {
      table,
      operation: type, // insert/update/delete
      data,
      oldData,
      timestamp: Date.now(),
      source: this.currentRegion,
    };

    // 1. Write to the local message queue first.
    await this.mq.publish('sync.queue', syncMessage);

    // 2. Mirror selected tables to Redis (used for real-time front-end queries).
    if (['product', 'customer'].includes(table)) {
      await this.syncToRedis(table, data);
    }

    // 3. Mark every other region as needing a sync.
    // SADD with no members is a Redis error, so skip when nothing remains.
    const targets = this.syncRegions.filter((region) => region !== this.currentRegion);
    if (targets.length > 0) {
      await this.redis.sadd('pending_sync_regions', targets);
    }
  }

  /**
   * Synchronizes to every region currently marked as pending.
   *
   * The pending mark is cleared before the sync starts, so a change that
   * arrives while the sync is running re-marks the region and is picked up
   * by the next run. On failure the mark is restored and the region is added
   * to the retry queue.
   *
   * @returns {Promise<void>}
   */
  async syncToRemoteRegions() {
    const pendingRegions = await this.redis.smembers('pending_sync_regions');

    for (const region of pendingRegions) {
      try {
        await this.redis.srem('pending_sync_regions', region);
        await this.syncToRegion(region);
      } catch (error) {
        console.error(`Sync to ${region} failed:`, error);
        await this.redis.sadd('pending_sync_regions', region);
        // Add to the retry queue on failure.
        await this.addToRetryQueue(region);
      }
    }
  }

  /**
   * Consistency check: compares row counts between the local and the remote
   * database and raises a `data_inconsistency` alert for each mismatch.
   *
   * @param {string[]} [tables] - Tables to compare.
   * @returns {Promise<void>}
   * @throws {Error} If no remote MySQL client is configured.
   */
  async verifyDataConsistency(tables = ['product', 'customer', 'order']) {
    if (!this.remoteMysql) {
      throw new Error('verifyDataConsistency requires a remote MySQL client');
    }

    for (const table of tables) {
      const sql = `SELECT COUNT(*) as cnt FROM ${CrossRegionSyncService.quoteIdentifier(table)}`;

      // The two counts are independent, so query both sides in parallel.
      const [localResult, remoteResult] = await Promise.all([
        this.mysql.query(sql),
        this.remoteMysql.query(sql),
      ]);
      const local = CrossRegionSyncService.extractCount(localResult);
      const remote = CrossRegionSyncService.extractCount(remoteResult);

      if (local !== remote) {
        await this.triggerAlert('data_inconsistency', { table, local, remote });
      }
    }
  }
}

故障自动切换机制

/**
 * Automatic failure detection and failover.
 *
 * Several helpers called below (`getAllNodes`, `updateNodeHealth`,
 * `markNodeUnavailable`, `markNodeAvailable`, `getBackupNodes`,
 * `switchDatabaseConnection`, `switchCacheConnection`, `sendAlert`) are not
 * defined in this class; they are expected to be provided elsewhere
 * (for example by a subclass).
 */
class FailoverService {
  /**
   * @param {object} healthCheck - Exposes `checkNode(node)` resolving to a truthy value when healthy.
   * @param {object} loadBalancer - Exposes `addNode(node)` and `removeNode(node)`.
   * @param {object} dnsManager - Exposes `update(hostname, ip)`.
   */
  constructor(healthCheck, loadBalancer, dnsManager) {
    this.healthCheck = healthCheck;
    this.loadBalancer = loadBalancer;
    this.dnsManager = dnsManager;
    this.failoverConfig = {
      maxFailureCount: 3,        // fail over after 3 consecutive failures
      checkInterval: 5000,       // check every 5 seconds
      failoverTimeout: 30000,    // failover should complete within 30 seconds
      recoveryThreshold: 3       // switch back only after 3 consecutive successes
    };

    // Per-node counters keyed by node id. Kept here rather than only on the
    // node objects so that counting works even when node objects arrive
    // without counters or are re-created on every round.
    this.nodeStates = new Map();
    this.healthCheckTimer = null;
    this.isChecking = false;
  }

  /**
   * Returns the tracked state for a node, creating it on first sight.
   * Counters already present on the node object are used as the seed.
   *
   * @param {object} node - Node with an `id`.
   * @returns {{failureCount: number, successCount: number, failedOver: boolean}}
   */
  getNodeState(node) {
    let state = this.nodeStates.get(node.id);
    if (!state) {
      state = {
        failureCount: Number.isInteger(node.failureCount) ? node.failureCount : 0,
        successCount: Number.isInteger(node.successCount) ? node.successCount : 0,
        failedOver: false,
      };
      this.nodeStates.set(node.id, state);
    }
    return state;
  }

  /**
   * Starts the periodic health check. Calling it again while a timer is
   * active returns the existing timer instead of creating a second one.
   *
   * @returns {Promise<object>} The interval handle.
   */
  async startHealthCheck() {
    if (this.healthCheckTimer) {
      return this.healthCheckTimer;
    }

    this.healthCheckTimer = setInterval(async () => {
      // Skip this tick if the previous round is still running.
      if (this.isChecking) {
        return;
      }
      this.isChecking = true;
      try {
        await this.checkAllNodes();
      } catch (error) {
        // A rejected round must not become an unhandled rejection.
        console.error('Health check round failed:', error);
      } finally {
        this.isChecking = false;
      }
    }, this.failoverConfig.checkInterval);

    return this.healthCheckTimer;
  }

  /**
   * Stops the periodic health check, if it is running.
   */
  stopHealthCheck() {
    if (this.healthCheckTimer) {
      clearInterval(this.healthCheckTimer);
      this.healthCheckTimer = null;
    }
  }

  /**
   * Checks the health of every node and dispatches to failure or recovery
   * handling. A health probe that throws is treated as an unhealthy result.
   *
   * @returns {Promise<void>}
   */
  async checkAllNodes() {
    const nodes = await this.getAllNodes();

    for (const node of nodes) {
      let isHealthy;
      try {
        isHealthy = await this.healthCheck.checkNode(node);
      } catch (error) {
        console.error(`Health check for node ${node.id} threw:`, error);
        isHealthy = false;
      }

      // Record the latest health status.
      await this.updateNodeHealth(node.id, isHealthy);

      if (!isHealthy) {
        await this.handleNodeFailure(node);
      } else {
        await this.handleNodeRecovery(node);
      }
    }
  }

  /**
   * Handles one failed health check. Once the consecutive-failure threshold
   * is reached the node is failed over exactly once; further failures of an
   * already failed-over node only update the counters.
   *
   * @param {object} node - The failing node.
   * @returns {Promise<void>}
   */
  async handleNodeFailure(node) {
    const state = this.getNodeState(node);
    state.failureCount += 1;
    // A failure interrupts any run of successes.
    state.successCount = 0;
    // Mirror the counters onto the node for code that reads them there.
    node.failureCount = state.failureCount;
    node.successCount = state.successCount;

    console.log(`Node ${node.id} health check failed, count: ${node.failureCount}`);

    if (state.failedOver || state.failureCount < this.failoverConfig.maxFailureCount) {
      return;
    }

    console.log(`Node ${node.id} marked as unhealthy, starting failover...`);

    // 1. Mark the node as unavailable.
    await this.markNodeUnavailable(node);

    // 2. Notify the load balancer.
    await this.loadBalancer.removeNode(node);

    // 3. Switch traffic to a backup node.
    await this.switchToBackup(node);

    // Set only after the switch succeeded, so a failed attempt is retried
    // on the next failing round.
    state.failedOver = true;

    // 4. Send the alert.
    await this.sendAlert('node_failover', {
      node: node.id,
      timestamp: new Date(),
      originalRegion: node.region
    });
  }

  /**
   * Switches traffic from a failed node to the first available backup node.
   * Logs an error and returns when no backup node is available.
   *
   * @param {object} failedNode - The node being replaced.
   * @returns {Promise<void>}
   */
  async switchToBackup(failedNode) {
    const backupNodes = await this.getBackupNodes(failedNode);

    if (!backupNodes || backupNodes.length === 0) {
      console.error('No backup nodes available!');
      return;
    }

    const backupNode = backupNodes[0];

    // Switch the database connection.
    await this.switchDatabaseConnection(failedNode, backupNode);

    // Switch the cache connection.
    await this.switchCacheConnection(failedNode, backupNode);

    // Update DNS only for a cross-region switch.
    if (failedNode.region !== backupNode.region) {
      await this.dnsManager.update(failedNode.hostname, backupNode.ip);
    }

    console.log(`Traffic switched from ${failedNode.id} to ${backupNode.id}`);
  }

  /**
   * Handles one successful health check.
   *
   * For a node that was never failed over, a success simply ends the current
   * failure streak. For a failed-over node, the node is put back into the
   * pool after `recoveryThreshold` consecutive successes.
   *
   * @param {object} node - The healthy node.
   * @returns {Promise<void>}
   */
  async handleNodeRecovery(node) {
    const state = this.getNodeState(node);

    if (!state.failedOver) {
      // Still in the pool: nothing to add back, just reset the streak.
      state.failureCount = 0;
      state.successCount = 0;
      node.failureCount = 0;
      node.successCount = 0;
      return;
    }

    state.successCount += 1;
    node.successCount = state.successCount;

    if (state.successCount < this.failoverConfig.recoveryThreshold) {
      return;
    }

    console.log(`Node ${node.id} recovered, adding back to pool...`);

    await this.loadBalancer.addNode(node);
    await this.markNodeAvailable(node);

    state.failureCount = 0;
    state.successCount = 0;
    state.failedOver = false;
    node.failureCount = 0;
    node.successCount = 0;
  }
}

灾备演练计划

| 演练类型 | 频率 | 演练内容 | 预期目标 |
| --- | --- | --- | --- |
| 单机故障 | 每周 | 模拟单个应用节点故障 | 60秒内自动恢复 |
| 数据库故障 | 每月 | 模拟主库故障,切换到从库 | 5分钟恢复,数据不丢失 |
| 机房故障 | 每季度 | 模拟整个机房不可用 | 15分钟切换到备用机房 |
| 数据恢复 | 每月 | 从备份恢复数据到测试环境 | 数据完整,可查询 |

总结

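PSI 进销存系统高可用架构的核心要点:

- 多层级高可用:接入层、应用层、数据层、缓存层均采用多节点冗余设计;
- 应用层基于 Kubernetes 部署,通过 Pod 反亲和性、健康探针和 HPA 自动伸缩保障服务可用;
- 数据库采用 GTID + 半同步的主从复制,并通过 MySQL Router 实现读写分离;
- 跨机房数据同步结合消息队列与一致性校验,及时发现数据不一致;
- 故障自动检测与切换,节点恢复后需连续多次健康检查通过才重新加入;
- 定期开展灾备演练,验证恢复时间与数据完整性目标。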

通过完善的高可用架构设计,可以确保 PSI 进销存系统 7×24 小时稳定运行,为企业业务连续性提供可靠保障。

← 下一篇:PSI进销存系统工作流引擎与审批流程设计