Python下判断文件是否为二进制文件的三种方法

常用的有两种方法判断文件是否为二进制文件,最准确的就是把这两种方法结合起来更加准确点.

方法1利用codecs模块

它首先检查文件是否以BOM开始,如果不在初始8192字节内查找零字节:

import codecs

file_path = "/home/ubuntu/zgd/ztest/_gs418_510txp_v6.6.2.7.stk.extracted/test"

#: BOMs to indicate that a file is a text file even if it contains zero bytes.
_TEXT_BOMS = (
    codecs.BOM_UTF16_BE,
    codecs.BOM_UTF16_LE,
    codecs.BOM_UTF32_BE,
    codecs.BOM_UTF32_LE,
    codecs.BOM_UTF8,
    )


def is_binary_file(file_path):
    with open(file_path, 'rb') as file:
        initial_bytes = file.read(8192)
        file.close()
    return not any(initial_bytes.startswith(bom) for bom in _TEXT_BOMS) and b'\0' in initial_bytes

if __name__ == "__main__":
    print is_binary_file(file_path)

上面is_binary_file()函数也可以改成下面的方式:

def is_binary_file(file_path):
    with open(file_path, 'rb') as file:
        initial_bytes = file.read(8192)
        file.close()
        for bom in _TEXT_BOMS:
            if initial_bytes.startswith(bom):
                continue
            else:
                if b'\0' in initial_bytes:
                    return True
    return False


方法2利用magic模块

安装模块: pip install python-magic

def getFileType(ff):
    mime_kw = 'x-executable|x-sharedlib|octet-stream|x-object'  ###可执行文件、链接库、动态流、对象
    try:
        magic_mime = magic.from_file(ff, mime=True)
        magic_hit = re.search(mime_kw, magic_mime, re.I)
        if magic_hit:
            return True
        else:
            return False
    except Exception, e:
        print e.message

最好的方法是对两种类型同时进行处理:

# -*- coding:utf-8 -*-
# @Author:zgd
# @time:2019/6/21
# @File:operateSystem.py

import magic
import re
import codecs

def is_binary_file_1(ff):
    '''
    根据text文件数据类型判断是否是二进制文件
    :param ff: 文件名(含路径)
    :return: True或False,返回是否是二进制文件
    '''
    TEXT_BOMS = (
        codecs.BOM_UTF16_BE,
        codecs.BOM_UTF16_LE,
        codecs.BOM_UTF32_BE,
        codecs.BOM_UTF32_LE,
        codecs.BOM_UTF8,
    )
    with open(file_path, 'rb') as file:
        CHUNKSIZE = 8192
        initial_bytes = file.read(CHUNKSIZE)
        file.close()
    #: BOMs to indicate that a file is a text file even if it contains zero bytes.
    return not any(initial_bytes.startswith(bom) for bom in TEXT_BOMS) and b'\0' in initial_bytes


def is_binwary_file_2(ff):
    '''
    根据magic文件的魔术判断是否是二进制文件
    :param ff: 文件名(含路径)
    :return: True或False,返回是否是二进制文件
    '''
    mime_kw = 'x-executable|x-sharedlib|octet-stream|x-object'  ###可执行文件、链接库、动态流、对象
    try:
        magic_mime = magic.from_file(ff, mime=True)
        magic_hit = re.search(mime_kw, magic_mime, re.I)
        if magic_hit:
            return True
        else:
            return False
    except Exception, e:
        return False


if __name__ == "__main__":
    file_path = "/home/ubuntu/zgd/ztest/_gs418_510txp_v6.6.2.7.stk.extracted/D0"
    print is_binary_file_1(file_path)
    print is_binwary_file_2(file_path)
    print any((is_binary_file_1(file_path), is_binwary_file_2(file_path)))


方法3判断是否有ELF头

根据文件中是否有ELF头进行判断文件是否为二进制文件

# 判断文件是否是elf文件
def is_ELFfile(filepath):
    if not os.path.exists(filepath):
        logger.info('file path {} doesnot exits'.format(filepath))
        return False
    # 文件可能被损坏,捕捉异常
    try:
        FileStates = os.stat(filepath)
        FileMode = FileStates[stat.ST_MODE]
        if not stat.S_ISREG(FileMode) or stat.S_ISLNK(FileMode):  # 如果文件既不是普通文件也不是链接文件
            return False
        with open(filepath, 'rb') as f:
            header = (bytearray(f.read(4))[1:4]).decode(encoding="utf-8")
            # logger.info("header is {}".format(header))
            if header in ["ELF"]:
                # print header
                return True
    except UnicodeDecodeError as e:
        # logger.info("is_ELFfile UnicodeDecodeError {}".format(filepath))
        # logger.info(str(e))
        pass

    return False

0 个评论

要回复文章请先登录注册