ansible-devops/roles/gpu_drive/tasks/main.yml

43 lines
1.5 KiB
YAML
Raw Permalink Normal View History

2025-07-05 15:49:53 +08:00
# Ensure the per-driver log directory exists below the shared log root.
- name: 创建角色专属日志目录
  file:
    state: directory
    path: "{{ log_base_dir }}/{{ driver.name }}"
    # Kept as a quoted string so the leading zero is not lost to YAML typing.
    mode: "0750"
# Upload the driver install script to the managed host and make it executable.
- name: 同步驱动脚本到目标服务器
  copy:
    dest: "{{ script_dest }}/{{ driver.install_script }}"
    src: "{{ driver.install_script }}"
    # Make sure the most recent script is the one on the host.
    force: true
    mode: "0755"
# Run the uploaded driver script with the requested operation (install/uninstall).
# `operations[operation]` selects the argument handed to the script, and
# `--version` is appended only when target_version is defined and non-empty.
- name: 执行驱动操作（安装/卸载）
  shell: |
    {{ script_dest }}/{{ driver.install_script }} \
    {{ operations[operation] }} \
    {% if target_version is defined and target_version != "" %}--version {{ target_version | quote }}{% endif %}
  environment:
    # Expose this host's GPU model (from inventory) to the script.
    GPU_MODEL: "{{ hostvars[inventory_hostname]['gpu_model'] }}"
  # Run with privilege escalation (sudo).
  become: true
  register: script_result
  # `retries`/`delay` are only honoured together with `until` on ansible-core
  # releases before 2.16; without it the task would run exactly once.
  until: script_result.rc == 0
  # Give up after 3 retries, waiting 30 seconds between attempts.
  retries: 3
  delay: 30
# Post-install verification: run the driver's check command and fail when it
# exits non-zero or when any captured output reports zero GPUs.
- name: 验证操作结果（安装时）
  shell: "{{ driver.service_check }}"
  register: driver_check_result
  # Read-only probe; never report a change.
  changed_when: false
  # The condition must be a single YAML scalar: starting the value with a
  # quoted literal followed by more text is a YAML syntax error.
  # `failed_when` replaces the default rc-based failure, so rc is tested
  # explicitly. The install script's stderr is still inspected as before.
  failed_when: >-
    driver_check_result.rc != 0
    or 'GPU count: 0' in driver_check_result.stdout
    or 'GPU count: 0' in driver_check_result.stderr
    or 'GPU count: 0' in script_result.stderr
  when: operation == "install"
# Report the outcome of the driver operation to the central logging endpoint
# as a JSON document (observability hook).
- name: 记录操作日志（企业级可观测性）
  uri:
    url: "http://logging.internal.com/api/ansible"
    method: POST
    body_format: json
    body:
      host: "{{ inventory_hostname }}"
      component: "{{ driver.name }}_driver"
      operation: "{{ operation }}"
      # The second argument `true` makes the default also apply to an empty
      # target_version, matching how the script invocation treats "" as unset.
      version: "{{ target_version | default(driver.default_version, true) }}"
      # NOTE(review): a non-zero rc normally fails the script task before this
      # task runs, so the "failed" value may never be sent — confirm intent.
      status: "{% if script_result.rc == 0 %}success{% else %}failed{% endif %}"