我的CEPH OSD因为列阵卡不支持直通是基于RAID0的,最近ceph性能不足,排查过程iostat观察发现有一些盘r_await或w_await持续1000多,这就需要换盘了。ceph存储一份数据如果是默认3副本,那么就会3份副本写完才会完成写入,如果有1个osd延迟很高,就会影响整体写入速度。

首先可以运行 ceph-volume lvm list 查看osd对应的盘符,如果是bcache,则需要再运行lsblk查看在哪个盘符下。然后查看是哪个盘符,以及盘位,进行更换重建。

Python基于MegaCli获取raid卡状态的代码:https://raw.githubusercontent.com/eLvErDe/hwraid/master/wrapper-scripts/megaclisas-status

这个大佬的python脚本可以快速帮忙定位,运行完后再执行下:lsscsi 查看盘符在系统中的编号就可以定位到盘了,记得MegaCli得先安装。代码贴上:

#!/usr/bin/python
# $Id: megaclisas-status,v 1.78 2018/10/01 03:52:57 root Exp root $
#
# Written by Adam Cecile <gandalf@NOSPAM.le-vert.net>
# Modified by Vincent S. Cojot <vincent@NOSPAM.cojot.name>
#

import os
import re
import sys
import pdb
import inspect
import argparse

os.system('lsscsi')

if sys.platform == "win32":
    import ctypes

def_megaclipath = "/opt/MegaRAID/MegaCli/MegaCli64"

# Non-Nagios Mode defaults
nagiosmode = False
nagiosoutput = ""
nagiosgoodarray = 0
nagiosbadarray = 0
nagiosgooddisk = 0
nagiosbaddisk = 0

# Sane defaults
printarray = True
printcontroller = True
debugmode = False
notempmode = False
totaldrivenumber = 0
totalconfdrivenumber = 0
totalunconfdrivenumber = 0

# Hardcode a max of 16 HBA and 128 LDs for now. LDTable must be initialized to accept populating list of LD's into each ctlr's list.
MaxNumHBA = 16
MaxNumLD = 128
LDTable = [[] * MaxNumHBA for i in range(MaxNumLD)]
NestedLDTable = [[False for i in range(MaxNumLD)] for j in range(MaxNumHBA)]

# Outputs is a 'dict' of all MegaCLI outputs so we can re-use them during loops..
Outputs = {}
ConfDisks = {}
NagiosBadDisks = {}
NagiosGoodDisks = {}

# We need root access to query
if __name__ == "__main__":
    # deal with command line options
    parser = argparse.ArgumentParser()
    parser.add_argument("--nagios", help="enable nagios support", action="store_true")
    parser.add_argument("--debug", help="enable debugging output", action="store_true")
    parser.add_argument("--notemp", help="disable temperature reporting", action="store_true")

    args = parser.parse_args()
    nagiosmode = args.nagios
    debugmode = args.debug
    notempmode = args.notemp

    try:
        root_or_admin = os.geteuid() == 0
    except AttributeError:
        root_or_admin = ctypes.windll.shell32.IsUserAnAdmin() != 0
    if not root_or_admin:
        print("# This script requires Administrator privileges")
        sys.exit(5)

# Functions
def dbgprint(msg):
    if debugmode:
        sys.stderr.write(str("# DEBUG (" + str(inspect.currentframe().f_back.f_lineno) + ") : " + msg + "\n"))


def is_exe(fpath):
    return os.path.isfile(fpath) and os.access(fpath, os.X_OK)


def which(program):
    import os

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        # Add some defaults
        os.environ["PATH"] += os.pathsep + "/opt/MegaRAID/MegaCli"
        os.environ["PATH"] += os.pathsep + "/ms/dist/hwmgmt/bin"
        os.environ["PATH"] += os.pathsep + "/opt/MegaRAID/perccli"
        os.environ["PATH"] += os.pathsep + "/opt/MegaRAID/storcli"
        os.environ["PATH"] += os.pathsep + "/opt/lsi/storcli"
        os.environ["PATH"] += os.pathsep + os.path.dirname(os.path.realpath(sys.argv[0]))
        for path in os.environ["PATH"].split(os.pathsep):
            dbgprint("Looking in PATH " + str(path))
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                dbgprint('Found "' + program + '" at ' + exe_file)
                return exe_file
    return None


# Find MegaCli
for megabin in "MegaCli64", "MegaCli", "megacli", "MegaCli.exe", "perccli64", "perccli", "storcli64", "storcli":
    dbgprint("Looking for " + str(megabin) + " in PATH...")
    megaclipath = which(megabin)
    if megaclipath != None:
        dbgprint("Will use this executable: " + str(megaclipath))
        break

# Check binary exists (and +x), if not print an error message
if megaclipath != None:
    if os.path.exists(megaclipath) and os.access(megaclipath, os.X_OK):
        pass
    else:
        if nagiosmode:
            print("UNKNOWN - Cannot find " + megaclipath)
        else:
            print("Cannot find " + megaclipath + "in your PATH. Please install it.")
        sys.exit(3)
else:
    print('Cannot find "MegaCli{64,}", "megacli{64,}", "perccli{64,}" or "storcli{64,}" in your PATH. Please install one of them.')
    sys.exit(3)


#### pdb.set_trace()


def returnWdthFromArrayCol(glarray, idx):
    maxwdth = 0
    for glrow in glarray:
        if len(glrow[idx]) > maxwdth:
            maxwdth = len(glrow[idx])
    return maxwdth


# Get and cache command output
def getOutput(cmd):
    lines = []
    if cmd in Outputs:
        dbgprint("Got Cached value: " + str(cmd))
        lines = Outputs[cmd]
    else:
        dbgprint("Not a Cached value: " + str(cmd))
        output = os.popen(cmd)
        for line in output:
            if not re.match(r"^$", line.strip()):
                lines.append(line.strip())
        Outputs[cmd] = lines
    return lines


# Get and cache disks, make sure we don't count the same disk twice
def AddDisk(mytable, disk):
    lines = []
    if disk in mytable:
        dbgprint("Disk: " + str(disk) + " Already present in Disk Table")
        return False
    else:
        dbgprint("Confed " + str(nagiosgooddisk) + "/" + str(nagiosbaddisk) + "Disk: " + str(disk) + " Not already present in Disk Table, adding")
        mytable[disk] = True
        return True


def returnControllerNumber(output):
    for line in output:
        if re.match(r"^Controller Count.*$", line.strip()):
            return int(line.split(":")[1].strip().strip("."))


def returnTotalDriveNumber(output):
    for line in output:
        if re.match(r"Number of Physical Drives on Adapter.*$", line.strip()):
            return int(line.split(":")[1].strip())


def returnRebuildProgress(output):
    percent = 0
    tmpstr = ""
    for line in output:
        if re.match(r"^Rebuild Progress on Device at Enclosure.*, Slot .* Completed ", line.strip()):
            tmpstr = line.split("Completed")[1].strip()
            percent = int(tmpstr.split("%")[0].strip())
    return percent


def returnConfDriveNumber(controllerid, output):
    # Count the configured drives
    confdrives = 0
    enclid = "N/A"
    slotid = "N/A"
    for line in output:

        if re.match(r"Enclosure Device ID: .*$", line.strip()):
            # We match here early in the analysis so reset the vars if this is a new disk we're reading..
            enclid = line.split(":")[1].strip()
        elif re.match(r"Slot Number: .*$", line.strip()):
            slotid = line.split(":")[1].strip()
            if AddDisk(ConfDisks, str(controllerid) + enclid + slotid):
                confdrives += 1
    return int(confdrives)


def returnUnConfDriveNumber(output):
    # Count the un-configured/Hotspare drives
    unconfdrives = 0
    for line in output:
        if re.match(r"^Firmware state: Unconfigured.*$", line.strip()):
            unconfdrives += 1
        elif re.match(r"^Firmware state: Hotspare.*$", line.strip()):
            unconfdrives += 1
    return int(unconfdrives)


def returnControllerModel(output):
    for line in output:
        if re.match(r"^Product Name.*$", line.strip()):
            return line.split(":")[1].strip()


def returnMemorySize(output):
    for line in output:
        if re.match(r"^Memory Size.*$", line.strip()):
            return line.split(":")[1].strip()


def returnFirmwareVersion(output):
    for line in output:
        if re.match(r"^FW Package Build.*$", line.strip()):
            return line.split(":")[1].strip()


def returnROCTemp(output):
    ROCtemp = ""
    tmpstr = ""
    if notempmode:
        return str("N/A")
    else:
        for line in output:
            if re.match(r"^ROC temperature :.*$", line.strip()):
                tmpstr = line.split(":")[1].strip()
                ROCtemp = re.sub(" +.*$", "", tmpstr)
        if ROCtemp != "":
            return str(str(ROCtemp) + "C")
        else:
            return str("N/A")


def returnBBUPresence(output):
    BBU = ""
    tmpstr = ""
    for line in output:
        if re.match(r"^BBU +:.*$", line.strip()):
            tmpstr = line.split(":")[1].strip()
            BBU = re.sub(" +.*$", "", tmpstr)
            break
    if BBU != "":
        return str(BBU)
    else:
        return str("N/A")


def returnBBUStatus(output):
    BBUStatus = ""
    tmpstr = ""
    for line in output:
        if re.match(r"^ *Battery Replacement required +:.*$", line.strip()):
            tmpstr = line.split(":")[1].strip()
            BBUStatus = re.sub(" +.*$", "", tmpstr)
            break
    if BBUStatus == "Yes":
        return str("REPL")
    else:
        return str("Good")


def returnArrayNumber(output):
    i = 0
    for line in output:
        if re.match(r"^(CacheCade )?Virtual Drive:.*$", line.strip()):
            i += 1
    return i


def returnHBAPCIInfo(output):
    busprefix = "0000"
    busid = ""
    devid = ""
    functionid = ""
    pcipath = ""
    for line in output:
        if re.match(r"^Bus Number.*:.*$", line.strip()):
            busid = str(line.strip().split(":")[1].strip()).zfill(2)
        if re.match(r"^Device Number.*:.*$", line.strip()):
            devid = str(line.strip().split(":")[1].strip()).zfill(2)
        if re.match(r"^Function Number.*:.*$", line.strip()):
            functionid = str(line.strip().split(":")[1].strip()).zfill(1)
    if busid:
        pcipath = str(busprefix + ":" + busid + ":" + devid + "." + functionid)
        dbgprint("Array PCI path : " + pcipath)
        return str(pcipath)
    else:
        return None


def returnHBAInfo(table, output, controllerid):
    controllermodel = "Unknown"
    controllerram = "Unknown"
    controllerrev = "Unknown"
    controllertemp = ""
    controllermodel = returnControllerModel(output)
    controllerram = returnMemorySize(output)
    controllerrev = returnFirmwareVersion(output)
    controllertemp = returnROCTemp(output)
    controllerbbu = returnBBUPresence(output)
    if controllerbbu == "Present":
        cmd = "%s -AdpBbuCmd -GetBbuStatus -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        controllerbbu = returnBBUStatus(output)

    if controllermodel != "Unknown":
        table.append(["c" + str(controllerid), controllermodel, controllerram, str(controllertemp), str(controllerbbu), str("FW: " + controllerrev)])


def returnArrayInfo(output, controllerid, arrayid, arrayindex):
    id = "c" + str(controllerid) + "u" + str(arrayid)
    operationlinennumber = False
    linenumber = 0
    targetid = ""
    raidtype = ""
    raidlvl = ""
    size = ""
    state = "N/A"
    strpsz = ""
    dskcache = "N/A"
    properties = ""
    spandepth = 0
    diskperspan = 0
    cachecade_info = "None"

    for line in output:
        if re.match(r"^(CacheCade )?Virtual Drive:.*(Target Id: [0-9]+).*$", line.strip()):
            # Extract the SCSI Target ID
            targetid = line.strip().split(":")[2].split(")")[0].strip()
        elif re.match(r"^RAID Level.*?:.*$", line.strip()):
            # Extract the primary raid type, decide on X0 RAID level later when we hit Span Depth
            raidlvl = int(line.strip().split(":")[1].split(",")[0].split("-")[1].strip())
        elif re.match(r"^Size.*?:.*$", line.strip()):
            # Size reported in MB
            if re.match(r"^.*MB$", line.strip().split(":")[1]):
                size = line.strip().split(":")[1].strip("MB").strip()
                if float(size) > 1000:
                    size = str(int(round((float(size) / 1000)))) + "G"
                else:
                    size = str(int(round(float(size)))) + "M"
            # Size reported in TB
            elif re.match(r"^.*TB$", line.strip().split(":")[1]):
                size = line.strip().split(":")[1].strip("TB").strip()
                size = str(int(round((float(size) * 1000)))) + "G"
            # Size reported in GB (default)
            else:
                size = line.strip().split(":")[1].strip("GB").strip()
                size = str(int(round((float(size))))) + "G"
        elif re.match(r"^Span Depth.*?:.*$", line.strip()):
            # If Span Depth is greater than 1 chances are we have a RAID 10, 50 or 60
            spandepth = line.strip().split(":")[1].strip()
        elif re.match(r"^State.*?:.*$", line.strip()):
            state = line.strip().split(":")[1].strip()
        elif re.match(r"^Strip Size.*?:.*$", line.strip()):
            strpsz = line.strip().split(":")[1].strip()
        elif re.match(r"^Number Of Drives per span.*:.*$", line.strip()):
            diskperspan = int(line.strip().split(":")[1].strip())
        elif re.match(r"^Current Cache Policy.*?:.*$", line.strip()):
            props = line.strip().split(":")[1].strip()
            if re.search("ReadAdaptive", props):
                properties += "ADRA"
            if re.search("ReadAhead", props):
                properties += "RA"
            if re.match("ReadAheadNone", props):
                properties += "NORA"
            if re.search("WriteBack", props):
                properties += ",WB"
            if re.match("WriteThrough", props):
                properties += ",WT"
        elif re.match(r"^Disk Cache Policy.*?:.*$", line.strip()):
            props = line.strip().split(":")[1].strip()
            if re.search("Disabled", props):
                dskcache = "Disabled"
            if re.search("Disk.s Default", props):
                dskcache = "Default"
            if re.search("Enabled", props):
                dskcache = "Enabled"
        elif re.match(r"^Ongoing Progresses.*?:.*$", line.strip()):
            operationlinennumber = linenumber
        elif re.match(r"Cache Cade Type\s*:.*$", line):
            cachecade_info = "Type : " + line.strip().split(":")[1].strip()
        elif re.match(r"^Target Id of the Associated LDs\s*:.*$", line):
            associated = []
            for array in line.split(":")[1].strip().split(","):
                if array.isdigit():
                    associated.append("c%du%d" % (controllerid, int(array)))
            if len(associated) >= 1:
                cachecade_info = "Associated : %s" % (", ".join(associated))
        linenumber += 1

    # If there was an ongoing operation, find the relevant line in the previous output
    if operationlinennumber:
        inprogress = str(output[operationlinennumber + 1])
        # some ugly output fix..
        str1 = inprogress.split(":")[0].strip()
        str2 = inprogress.split(":")[1].strip()
        inprogress = str1 + " : " + str2
    else:
        inprogress = "None"

    # Compute the RAID level
    NestedLDTable[int(controllerid)][int(arrayindex)] = False
    if raidlvl == "":
        raidtype = str("N/A")
    else:
        if int(spandepth) >= 2:
            raidtype = str("RAID-" + str(raidlvl) + "0")
            NestedLDTable[controllerid][int(arrayindex)] = True
        else:
            if raidlvl == 1:
                if diskperspan > 2:
                    raidtype = str("RAID-10")
                    NestedLDTable[controllerid][int(arrayindex)] = True
                else:
                    raidtype = str("RAID-" + str(raidlvl))
            else:
                raidtype = str("RAID-" + str(raidlvl))

    dbgprint("RAID Level: " + str(raidlvl) + " Span Depth: " + str(spandepth) + " Disk Per Span: " + str(diskperspan) + " Raid Type: " + str(raidtype))
    return [id, raidtype, size, strpsz, properties, dskcache, state, targetid, cachecade_info, inprogress]


def returnDiskInfo(output, controllerid):
    arrayid = False
    arrayindex = -1
    sarrayid = "Unknown"
    diskid = False
    oldenclid = False
    enclid = False
    spanid = False
    slotid = False
    lsidid = "Unknown"
    table = []
    fstate = "Offline"
    substate = "Unknown"
    model = "Unknown"
    speed = "Unknown"
    dsize = "Unknown"
    temp = "Unk0C"
    percent = 0
    for line in output:
        if re.match(r"^Span: [0-9]+ - Number of PDs:", line.strip()):
            spanid = line.split(":")[1].strip()
            spanid = re.sub(" - Number of PDs.*", "", spanid)
        elif re.match(r"Enclosure Device ID: .*$", line.strip()):
            # We match here early in the analysis so reset the vars if this is a new disk we're reading..
            oldenclid = enclid
            enclid = line.split(":")[1].strip().replace("N/A", "")
            if oldenclid != False:
                fstate = "Offline"
                model = "Unknown"
                speed = "Unknown"
                temp = "Unk0C"
                slotid = False
                lsidid = "Unknown"
        elif re.match(r"^Coerced Size: ", line.strip()):
            dsize = line.split(":")[1].strip()
            dsize = re.sub(" \[.*\.*$", "", dsize)
            dsize = re.sub("[0-9][0-9] GB", " Gb", dsize)
        elif re.match(r"^(CacheCade )?Virtual (Disk|Drive): [0-9]+.*$", line.strip()):
            arrayindex += 1
            arrayid = line.split("(")[0].split(":")[1].strip()
        elif re.match(r"^Drive.s posi*tion: DiskGroup: [0-9]+,.*$", line.strip()):
            notarrayid = line.split(",")[1].split(":")[1].strip()
        elif re.match(r"PD: [0-9]+ Information.*$", line.strip()):
            diskid = line.split()[1].strip()
        elif re.match(r"^Device Id: .*$", line.strip()):
            lsidid = line.split(":")[1].strip()
        elif re.match(r"Slot Number: .*$", line.strip()):
            slotid = line.split(":")[1].strip()
        elif re.match(r"Firmware state: .*$", line.strip()):
            fstate = line.split(":")[1].strip()
            subfstate = re.sub("\(.*", "", fstate)
            dbgprint("Firmware State: " + str(fstate) + " " + str(subfstate))
        elif re.match(r"Inquiry Data: .*$", line.strip()):
            model = line.split(":")[1].strip()
            model = re.sub(" +", " ", model)

            # re-define our "sub-code"
            # our seagate drives have an ID string of
            # 'Z1E19S2QST2000DM001-1CH164                      CC43'
            # or
            # '6XW02738ST32000542AS                            CC32'

            m = re.match(r"(\w{8})(ST\w+)(?:-(\w{6}))?(?:\s+(\w+))", model)
            if m:
                if m.group(3):
                    model = "{0}-{1} {2} {3}".format(m.group(2), m.group(3), m.group(4), m.group(1))
                else:
                    model = "{0} {1:>10} {2}".format(m.group(2), m.group(4), m.group(1))
                continue

            # Sub code
            manuf = re.sub(" .*", "", model)
            dtype = re.sub(manuf + " ", "", model)
            dtype = re.sub(" .*", "", dtype)
            hwserial = re.sub(".*" + dtype + " *", "", model)
        elif re.match(r"^Media Type: .*$", line.strip()):
            mtype = line.split(":")[1].strip()
            if mtype == "Hard Disk Device":
                mtype = "HDD"
            else:
                if mtype == "Solid State Device":
                    mtype = "SSD"
                else:
                    mtype = "N/A"
        elif re.match(r"Device Speed: .*$", line.strip()):
            speed = line.split(":")[1].strip()
        elif re.match(r"Drive Temperature :.*$", line.strip()):
            if notempmode:
                temp = "N/A"
            else:
                # Drive temp is amongst the last few lines matched, decide here if we add information to the table..
                temp = line.split(":")[1].strip()
                temp = re.sub(" \(.*\)", "", temp)
            if model != "Unknown":
                dbgprint("Disk Info: " + str(arrayid) + " " + str(diskid) + " " + str(oldenclid))
                if subfstate == "Rebuild":
                    cmd = "%s pdrbld -showprog -physdrv\[%s:%s\] -a%d -NoLog" % (megaclipath, enclid, slotid, controllerid)
                    output = getOutput(cmd)
                    percent = returnRebuildProgress(output)
                    fstate = str("Rebuilding (%d%%)" % (percent))

                if (NestedLDTable[controllerid][int(arrayindex)] == True) and (spanid != False):
                    sarrayid = str(arrayid) + "s" + spanid
                else:
                    sarrayid = str(arrayid)
                table.append([sarrayid, str(diskid), mtype, model, dsize, fstate, speed, temp, enclid, slotid, lsidid])
    return table


def returnUnconfDiskInfo(output, controllerid):
    arrayid = False
    diskid = False
    olddiskid = False
    enclid = False
    slotid = False
    lsidid = "Unknown"
    table = []
    fstate = "Offline"
    substate = "Unknown"
    model = "Unknown"
    speed = "Unknown"
    mtype = "Unknown"
    dsize = "Unknown"
    temp = "Unk0C"
    ospath = "N/A"
    for line in output:
        if re.match(r"Enclosure Device ID: .*$", line.strip()):
            # We match here early in the analysis so reset the vars if this is a new disk we're reading..
            oldenclid = enclid
            enclid = line.split(":")[1].strip().replace("N/A", "")
            if oldenclid != False:
                arrayid = False
                fstate = "Offline"
                model = "Unknown"
                speed = "Unknown"
                temp = "Unk0C"
                slotid = False
                lsidid = "Unknown"

        elif re.match(r"^Coerced Size: ", line.strip()):
            dsize = line.split(":")[1].strip()
            dsize = re.sub(" \[.*\.*$", "", dsize)
            dsize = re.sub("[0-9][0-9] GB", " Gb", dsize)
        elif re.match(r"^Drive.s posi*tion: DiskGroup: [0-9]+,.*$", line.strip()):
            arrayid = line.split(",")[1].split(":")[1].strip()
        elif re.match(r"^Device Id: [0-9]+.*$", line.strip()):
            diskid = line.split(":")[1].strip()
        elif re.match(r"Slot Number: .*$", line.strip()):
            slotid = line.split(":")[1].strip()
        elif re.match(r"Firmware state: .*$", line.strip()):
            fstate = line.split(":")[1].strip()
            subfstate = re.sub("\(.*", "", fstate)
            dbgprint("Firmware State: " + str(fstate) + " " + str(subfstate))
        elif re.match(r"Inquiry Data: .*$", line.strip()):
            model = line.split(":")[1].strip()
            model = re.sub(" +", " ", model)

            # re-define our "sub-code"
            # our seagate drives have an ID string of
            # 'Z1E19S2QST2000DM001-1CH164                      CC43'
            # or
            # '6XW02738ST32000542AS                            CC32'

            m = re.match(r"(\w{8})(ST\w+)(?:-(\w{6}))?(?:\s+(\w+))", model)
            if m:
                if m.group(3):
                    model = "{0}-{1} {2} {3}".format(m.group(2), m.group(3), m.group(4), m.group(1))
                else:
                    model = "{0} {1:>10} {2}".format(m.group(2), m.group(4), m.group(1))
                continue

            manuf = re.sub(" .*", "", model)
            dtype = re.sub(manuf + " ", "", model)
            dtype = re.sub(" .*", "", dtype)
            hwserial = re.sub(".*" + dtype + " *", "", model)
        elif re.match(r"^Media Type: .*$", line.strip()):
            mtype = line.split(":")[1].strip()
            if mtype == "Hard Disk Device":
                mtype = "HDD"
            else:
                if mtype == "Solid State Device":
                    mtype = "SSD"
                else:
                    mtype = "N/A"
        elif re.match(r"Device Speed: .*$", line.strip()):
            speed = line.split(":")[1].strip()
        elif re.match(r"Drive Temperature :.*$", line.strip()):
            # Drive temp is amongst the last few lines matched, decide here if we add information to the table..
            if notempmode:
                temp = "N/A"
            else:
                temp = line.split(":")[1].strip()
                temp = re.sub("\(.*\)", "", temp)
            if arrayid == False:
                if subfstate == "Unconfigured":
                    dbgprint("Unconfigured Disk: Arrayid: " + str(arrayid) + " DiskId: " + str(diskid) + " " + str(olddiskid) + " " + str(fstate))
                elif subfstate == "Online, Spun Up":
                    dbgprint("Online Unconfed Disk: Arrayid: " + str(arrayid) + " DiskId: " + str(diskid) + " " + str(olddiskid) + " " + str(fstate))
                table.append([mtype, model, dsize, fstate, speed, temp, enclid, slotid, diskid, ospath])
    return table


cmd = "%s -adpCount -NoLog" % (megaclipath)
output = getOutput(cmd)
controllernumber = returnControllerNumber(output)

bad = False

# List available controller
if printcontroller:
    if controllernumber:
        if not nagiosmode:
            print("-- Controller information --")

        i = 0
        controllerid = 0
        mlen = 0
        hbainfo = []
        while controllerid < controllernumber:
            cmd = "%s -AdpAllInfo -a%d -NoLog" % (megaclipath, controllerid)
            output = getOutput(cmd)
            returnHBAInfo(hbainfo, output, controllerid)
            controllerid += 1
        mlen = returnWdthFromArrayCol(hbainfo, 1)

        controllerid = 0
        for hba in hbainfo:
            hbafmt = str("%-5s | %-" + str(mlen) + "s | %-6s | %-4s | %-6s | %-12s ")
            # Header
            if i == 0:
                if not nagiosmode:
                    print(hbafmt % ("-- ID", "H/W Model", "RAM", "Temp", "BBU", "Firmware"))
            if not nagiosmode:
                print(hbafmt % (hba[0], hba[1], hba[2], hba[3], hba[4], hba[5]))
            i += 1
        if not nagiosmode:
            print("")
    else:
        print("No MegaRAID or PERC adapter detected on your system!")
        exit(1)

if printarray:
    if not nagiosmode:
        print("-- Array information --")

    controllerid = 0
    pcipath = ""
    diskpath = ""
    i = 0
    j = 0
    mlen = 0
    rlen = 0
    clen = 0
    while controllerid < controllernumber:
        arrayindex = 0

        cmd = "%s -LDInfo -lall -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraynumber = returnArrayNumber(output)
        # We need to explore each HBA to look for gaps in LD's
        ldid = 0
        ldcount = 0
        while ldcount < arraynumber:
            cmd = "%s -LDInfo -l%d -a%d -NoLog" % (megaclipath, ldid, controllerid)
            output = getOutput(cmd)
            for line in output:
                if re.match(r"^Adapter.*Virtual Drive .* Does not Exist", line.strip()):
                    ldid += 1
                elif re.match(r"^(CacheCade )?Virtual Drive:", line.strip()):
                    LDTable[controllerid].append(ldid)
                    # NestedLDTable[controllerid][int(arrayindex)] = False
                    ldcount += 1
                    ldid += 1

        while arrayindex < arraynumber:
            ldid = LDTable[controllerid][arrayindex]
            cmd = "%s -LDInfo -l%d -a%d -NoLog" % (megaclipath, ldid, controllerid)
            output = getOutput(cmd)
            arrayinfo = returnArrayInfo(output, controllerid, ldid, arrayindex)
            if len(arrayinfo[1]) > rlen:
                rlen = len(arrayinfo[1])
            if len(arrayinfo[4]) > mlen:
                mlen = len(arrayinfo[4])
            if len(arrayinfo[8]) > clen:
                clen = len(arrayinfo[8])
            arrayindex += 1
        controllerid += 1

    controllerid = 0
    while controllerid < controllernumber:
        arrayindex = 0

        cmd = "%s -AdpGetPciInfo -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        pcipath = returnHBAPCIInfo(output)

        cmd = "%s -LDInfo -lall -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraynumber = returnArrayNumber(output)
        while arrayindex < arraynumber:
            ldid = LDTable[controllerid][arrayindex]
            cmd = "%s -LDInfo -l%d -a%d -NoLog" % (megaclipath, ldid, controllerid)
            output = getOutput(cmd)
            arrayinfo = returnArrayInfo(output, controllerid, ldid, arrayindex)

            if pcipath:
                diskprefix = str("/dev/disk/by-path/pci-" + pcipath + "-scsi-0:")
                dbgprint("Will look for DISKprefix : " + diskprefix)
                # RAID disks are usually with a channel of '2', JBOD disks with a channel of '0'
                for j in range(1, 8):
                    diskpath = diskprefix + str(j) + ":" + str(arrayinfo[7]) + ":0"
                    dbgprint("Looking for DISKpath : " + diskpath)
                    if os.path.exists(diskpath):
                        arrayinfo[7] = os.path.realpath(diskpath)
                        dbgprint("Found DISK match: " + diskpath + " -> " + arrayinfo[7])
                        break
            else:
                arrayinfo[7] = "N/A"

            # Pad the string length, just to make sure it's aligned with the headers...
            if rlen < len("Type"):
                rlen = len("Type")
            if mlen < len("Flags"):
                mlen = len("Flags")
            if clen < len("CacheCade"):
                clen = len("CacheCade")

            ldfmt = str("%-5s | %-" + str(rlen) + "s | %7s | %7s | %" + str(mlen) + "s | %8s | %8s | %8s | %-" + str(clen) + "s |%-12s ")
            # Header
            if i == 0:
                if not nagiosmode:
                    print(ldfmt % ("-- ID", "Type", "Size", "Strpsz", "Flags", "DskCache", "Status", "OS Path", "CacheCade", "InProgress"))
            if not nagiosmode:
                print(
                    ldfmt
                    % (
                        arrayinfo[0],
                        arrayinfo[1],
                        arrayinfo[2],
                        arrayinfo[3],
                        arrayinfo[4],
                        arrayinfo[5],
                        arrayinfo[6],
                        arrayinfo[7],
                        arrayinfo[8],
                        arrayinfo[9],
                    )
                )
            dbgprint("Array state : LD " + arrayinfo[0] + ", status : " + arrayinfo[6])
            if arrayinfo[6] not in ["Optimal", "N/A"]:
                bad = True
                nagiosbadarray += 1
            else:
                nagiosgoodarray += 1
            arrayindex += 1
            i += 1
        controllerid += 1
    if not nagiosmode:
        print("")

controllerid = 0
while controllerid < controllernumber:
    cmd = "%s -PDGetNum -a%d -NoLog" % (megaclipath, controllerid)
    output = getOutput(cmd)
    totaldrivenumber += returnTotalDriveNumber(output)
    controllerid += 1

if totaldrivenumber:
    if not nagiosmode:
        print("-- Disk information --")

    i = 0
    dlen = 0
    mlen = 0
    flen = 0
    controllerid = 0
    while controllerid < controllernumber:
        arrayid = 0
        cmd = "%s -LDInfo -lall -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraynumber = returnArrayNumber(output)
        #### BUG: -LdPdInfo shows all PD on the adapter, not just for the LD we wanted..
        #### while arrayid <= arraynumber:
        cmd = "%s -LdPdInfo -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraydisk = returnDiskInfo(output, controllerid)
        for array in arraydisk:
            diskname = str(controllerid) + array[8] + array[9]
            dbgprint("Disk c" + diskname + " status : " + array[5])
            if re.match("|".join(["^Online$", "^Online, Spun Up$", "^Rebuilding \(.*"]), array[5]):
                if AddDisk(NagiosGoodDisks, diskname):
                    nagiosgooddisk += 1
            else:
                bad = True
                if AddDisk(NagiosBadDisks, diskname):
                    nagiosbaddisk += 1

        if returnWdthFromArrayCol(arraydisk, 0) > dlen:
            dlen = returnWdthFromArrayCol(arraydisk, 0)
        if returnWdthFromArrayCol(arraydisk, 3) > mlen:
            mlen = returnWdthFromArrayCol(arraydisk, 3)
        if returnWdthFromArrayCol(arraydisk, 5) > flen:
            flen = returnWdthFromArrayCol(arraydisk, 5)
        controllerid += 1

    controllerid = 0
    while controllerid < controllernumber:
        arrayid = 0

        cmd = "%s -LDInfo -lall -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraynumber = returnArrayNumber(output)
        #### BUG: -LdPdInfo shows all PD on the adapter, not just for said LD..
        #### while arrayid <= arraynumber:

        cmd = "%s -LdPdInfo -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraydisk = returnDiskInfo(output, controllerid)

        # Adjust print format with width computed above
        drvfmt = "%-" + str(dlen + 6) + "s | %-4s | %-" + str(mlen) + "s | %-8s | %-" + str(flen) + "s | %-8s | %-4s | %-8s | %-8s"
        for array in arraydisk:
            # Header
            if i == 0:
                if not nagiosmode:
                    print(drvfmt % ("-- ID", "Type", "Drive Model", "Size", "Status", "Speed", "Temp", "Slot ID", "LSI ID"))
            # Drive information
            if not nagiosmode:
                print(
                    drvfmt
                    % (
                        str("c" + str(controllerid) + "u" + array[0] + "p" + array[1]),  # c0p0
                        array[2],  # HDD/SDD
                        array[3],  # Model Information (Variable len)
                        array[4],  # Size
                        array[5],  # Status (Variable len)
                        array[6],  # Speed
                        array[7],  # Temp
                        str("[" + array[8] + ":" + array[9] + "]"),  # Slot ID
                        array[10],
                    )
                )  # LSI ID
            i = i + 1
        controllerid += 1
    if not nagiosmode:
        print("")

controllerid = 0
totalconfdrivenumber = 0
totalunconfdrivenumber = 0
totaldrivenumber = 0
while controllerid < controllernumber:
    cmd = "%s -LdPdInfo -a%d -NoLog" % (megaclipath, controllerid)
    output = getOutput(cmd)
    totalconfdrivenumber += returnConfDriveNumber(controllerid, output)

    cmd = "%s -PDGetNum -a%d -NoLog" % (megaclipath, controllerid)
    output = getOutput(cmd)
    totaldrivenumber += returnTotalDriveNumber(output)

    cmd = "%s -PDList -a%d -NoLog" % (megaclipath, controllerid)
    output = getOutput(cmd)
    # Sometimes a drive will be reconfiguring without any info on that it is going through a rebuild process.
    # This happens when expanding an R{5,6,50,60} array, for example. In that case, totaldrivenumber will still be
    # greater than totalconfdrivenumber while returnUnConfDriveNumber(output) will be zero. The math below attempts to solve this.
    totalunconfdrivenumber += max(returnUnConfDriveNumber(output), totaldrivenumber - totalconfdrivenumber)

    controllerid += 1

dbgprint("Total Drives in system : " + str(totaldrivenumber))
dbgprint("Total Configured Drives : " + str(totalconfdrivenumber))
dbgprint("Total Unconfigured Drives : " + str(totalunconfdrivenumber))

if totalunconfdrivenumber:
    if not nagiosmode:
        print("-- Unconfigured Disk information --")

    controllerid = 0
    pcipath = ""
    while controllerid < controllernumber:
        arrayid = 0

        cmd = "%s -LDInfo -lall -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraynumber = returnArrayNumber(output)
        cmd = "%s -AdpGetPciInfo -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        pcipath = returnHBAPCIInfo(output)
        #### BUG: -LdPdInfo shows all PD on the adapter, not just for given LD..
        #### while arrayid <= arraynumber:

        cmd = "%s -PDList -a%d -NoLog" % (megaclipath, controllerid)
        output = getOutput(cmd)
        arraydisk = returnUnconfDiskInfo(output, controllerid)
        for array in arraydisk:
            dbgprint("Unconfed " + str(nagiosgooddisk) + "/" + str(nagiosbaddisk) + " Disk c" + str(controllerid) + "uXpY status : " + array[3])
            if array[3] in [
                "Online",
                "Unconfigured(good), Spun Up",
                "Unconfigured(good), Spun down",
                "JBOD",
                "Hotspare, Spun Up",
                "Hotspare, Spun down",
                "Online, Spun Up",
            ]:
                nagiosgooddisk += 1
            else:
                bad = True
                nagiosbaddisk += 1

            # JBOD disks has a real device path and are not masked. Try to find a device name here, if possible.
            if pcipath:
                if array[3] in ["JBOD"]:
                    diskprefix = str("/dev/disk/by-path/pci-" + pcipath + "-scsi-0:0:")
                    dbgprint("Will look for DISKprefix : " + diskprefix)
                    # RAID disks are usually with a channel of '2', JBOD disks with a channel of '0'
                    diskpath = diskprefix + str(array[8]) + ":0"
                    dbgprint("Looking for DISKpath : " + diskpath)
                    if os.path.exists(diskpath):
                        dbgprint("Found DISK match: " + diskpath + " -> " + array[9])
                        array[9] = os.path.realpath(diskpath)
                    else:
                        dbgprint("DISK NOT present: " + diskpath)
                        array[9] = "N/A"

        mlen = returnWdthFromArrayCol(arraydisk, 1)
        flen = returnWdthFromArrayCol(arraydisk, 3)

        # Adjust print format with widths computed above
        drvfmt = "%-7s | %-4s | %-" + str(mlen) + "s | %-8s | %-" + str(flen + 2) + "s | %-8s | %-4s | %-8s | %-6s | %-8s"
        i = 0
        for array in arraydisk:
            # Header
            if i == 0:
                if not nagiosmode:
                    print(drvfmt % ("-- ID", "Type", "Drive Model", "Size", "Status", "Speed", "Temp", "Slot ID", "LSI ID", "Path"))
            # Drive information
            if not nagiosmode:
                print(
                    drvfmt
                    % (
                        str("c" + str(controllerid) + "uXpY"),  # cXpY
                        array[0],  # HDD/SDD
                        array[1],  # Model Information (Variable len)
                        array[2],  # Size
                        array[3],  # Status (Variable len)
                        array[4],  # Speed
                        array[5],  # Temp
                        str("[" + array[6] + ":" + array[7] + "]"),  # Slot ID
                        array[8],  # LSI ID
                        array[9],
                    )
                )  # OS path, if any
            i += 1
        controllerid += 1
    if not nagiosmode:
        print("")

if debugmode:
    dbgprint("Printing Outputs[][]")
    for myl in Outputs:
        dbgprint(myl + "\n")
        sys.stderr.write("\n".join("".join(map(str, myd)) for myd in Outputs[myl]) + "\n")
    dbgprint("Printing arraydisk[]")
    sys.stderr.write("\n".join(" | ".join(map(str, myd)) for myd in arraydisk) + "\n")
    dbgprint("Printing ConfDisks[]")
    sys.stderr.write("\n".join("".join(map(str, myd)) for myd in ConfDisks) + "\n")
    dbgprint("Printing NagiosGoodDisks[]")
    sys.stderr.write("\n".join("".join(map(str, myd)) for myd in NagiosGoodDisks) + "\n")
    dbgprint("Printing NagiosBadDisks[]")
    sys.stderr.write("\n".join("".join(map(str, myd)) for myd in NagiosBadDisks) + "\n")

if nagiosmode:
    if bad:
        print(
            "RAID ERROR - Arrays: OK:"
            + str(nagiosgoodarray)
            + " Bad:"
            + str(nagiosbadarray)
            + " - Disks: OK:"
            + str(nagiosgooddisk)
            + " Bad:"
            + str(nagiosbaddisk)
        )
        sys.exit(2)
    else:
        print(
            "RAID OK - Arrays: OK:"
            + str(nagiosgoodarray)
            + " Bad:"
            + str(nagiosbadarray)
            + " - Disks: OK:"
            + str(nagiosgooddisk)
            + " Bad:"
            + str(nagiosbaddisk)
        )
else:
    if bad:
        # DO NOT MODIFY OUTPUT BELOW
        # Scripts may relies on it
        # https://github.com/eLvErDe/hwraid/issues/99
        print("\nThere is at least one disk/array in a NOT OPTIMAL state.")
        print(
            "RAID ERROR - Arrays: OK:"
            + str(nagiosgoodarray)
            + " Bad:"
            + str(nagiosbadarray)
            + " - Disks: OK:"
            + str(nagiosgooddisk)
            + " Bad:"
            + str(nagiosbaddisk)
        )
        sys.exit(1)

保存为 megaclisas-status.py ,执行 python megaclisas-status.py 运行结果,可能页面显示看的比较遭,暂时没时间鼓捣,可以复制内容去文本全屏看的比较直观:

# python megaclisas-status.py 
[0:2:0:0]    disk    DELL     PERC H710P       3.13  /dev/sda 
[0:2:1:0]    disk    DELL     PERC H710P       3.13  /dev/sdb 
[0:2:2:0]    disk    DELL     PERC H710P       3.13  /dev/sdc 
[0:2:3:0]    disk    DELL     PERC H710P       3.13  /dev/sdd 
[0:2:4:0]    disk    DELL     PERC H710P       3.13  /dev/sde 
[0:2:5:0]    disk    DELL     PERC H710P       3.13  /dev/sdf 
[0:2:6:0]    disk    DELL     PERC H710P       3.13  /dev/sdg 
[0:2:7:0]    disk    DELL     PERC H710P       3.13  /dev/sdh 
-- Controller information --
-- ID | H/W Model       | RAM    | Temp | BBU    | Firmware     
c0    | PERC H710P Mini | 1024MB | 45C  | Good   | FW: 21.3.1-0004 

-- Array information --
-- ID | Type   |    Size |  Strpsz |   Flags | DskCache |   Status |  OS Path | CacheCade        |InProgress   
c0u0  | RAID-0 |    232G |   64 KB | ADRA,WB | Disabled |  Optimal |        0 | None             |None         
c0u1  | RAID-0 |   3637G |   64 KB | ADRA,WB | Disabled |  Optimal |        1 | Type : Read Only |None         
c0u2  | RAID-0 |   3637G |   64 KB | ADRA,WB | Disabled |  Optimal |        2 | Type : Read Only |None         
c0u3  | RAID-0 |   3637G |   64 KB | ADRA,WB | Disabled |  Optimal |        3 | Type : Read Only |None         
c0u4  | RAID-0 |   3637G |   64 KB | ADRA,WB | Disabled |  Optimal |        4 | Type : Read Only |None         
c0u5  | RAID-0 |   3637G |   64 KB | ADRA,WB | Disabled |  Optimal |        5 | Type : Read Only |None         
c0u6  | RAID-0 |   3637G |   64 KB | ADRA,WB | Disabled |  Optimal |        6 | Type : Read Only |None         
c0u7  | RAID-0 |  12732G |   64 KB | ADRA,WB | Disabled |  Optimal |        7 | Type : Read Only |None         

-- Disk information --
-- ID   | Type | Drive Model                                        | Size     | Status          | Speed    | Temp | Slot ID  | LSI ID  
c0u0p0  | SSD  | S4CKNF0NC22284X Samsung SSD 860 EVO 250GB RVT04B6Q | 232.3 Gb | Online, Spun Up | 6.0Gb/s  | 30C  | [32:0]   | 0       
c0u1p0  | HDD  | ST4000NM0035       SN07 WE26UXL9                   | 3.637 TB | Online, Spun Up | 6.0Gb/s  | 32C  | [32:1]   | 1       
c0u2p0  | HDD  | ST4000NM0035       SN07 WE26UXLP                   | 3.637 TB | Online, Spun Up | 6.0Gb/s  | 33C  | [32:2]   | 2       
c0u3p0  | HDD  | ST4000NM0035       SN07 WE28V2ST                   | 3.637 TB | Online, Spun Up | 6.0Gb/s  | 32C  | [32:4]   | 4       
c0u4p0  | HDD  | ST4000NM0035       SN07 WE28V2UL                   | 3.637 TB | Online, Spun Up | 6.0Gb/s  | 30C  | [32:5]   | 5       
c0u5p0  | HDD  | ST4000NM000A       SN07 WS21NA19                   | 3.637 TB | Online, Spun Up | 6.0Gb/s  | 31C  | [32:3]   | 3       
c0u6p0  | HDD  | VBH8A06F HGST HUS726T4TALE6L4 VKGNW9G0             | 3.637 TB | Online, Spun Up | 6.0Gb/s  | 27C  | [32:6]   | 6       
c0u7p0  | HDD  | Y6GD3X2C WDC WUH721414ALE6L4 LDGNW240              | 12.732 TB | Online, Spun Up | 6.0Gb/s  | 25C  | [32:7]   | 7    

例如要找/dev/sdd,看lsscsi的输出可以知道在系统中的sdd的信息“0:2:3:0”,然后看Array information 中 OS Path 3的ID是“c0u3”,接着看Disk information中“c0u3p0”中磁盘盘位是4。

这批有问题的盘是以前国内淘宝买的,真是被骗的好惨……说是全新,刚买回来测验也没问题,码也没问题,刚一年不到就这样了,基本可以确定是贴标的盘,造假技术太强了,现在硬盘带不回国只能扔掉了。湾仔电脑城买的延迟、速度就很正常,价钱高一些但可以确保是正品,坏了也可以马上拿去保修换新的。硬盘千万不要贪便宜淘宝买,要找专门经销商,检查好他们的资质,一定小心被骗。

标签: none

添加新评论