背景

今天做七牛相关接口开发的时候,发现七牛的文件列表查询返回结果中有个 hash 字段,但不知道它是怎么计算的。后来查到了七牛的官方仓库 qetag,官方对这个算法的描述如下:

qetag 是一个计算文件在七牛云存储上的 hash 值(也是文件下载时的 etag 值)的实用程序。

七牛的 hash/etag 算法是公开的。算法大体如下:

  • 如果你能够确认文件 <= 4M,那么 hash = UrlsafeBase64([0x16, sha1(FileContent)])。也就是,文件的内容的sha1值(20个字节),前面加一个byte(值为0x16),构成 21 字节的二进制数据,然后对这 21 字节的数据做 urlsafe 的 base64 编码。
  • 如果文件 > 4M,则 hash = UrlsafeBase64([0x96, sha1([sha1(Block1), sha1(Block2), …])]),其中 Block 是把文件内容切分为 4M 为单位的一个个块,也就是 BlockI = FileContent[I*4M:(I+1)*4M]

为何需要公开 hash/etag 算法?这个和 “消重” 问题有关,详细见:

为何在 sha1 值前面加一个byte的标记位(0x16或0x96)?

  • 0x16 = 22,而 2^22 = 4M。所以前面的 0x16 其实是文件按 4M 分块的意思。
  • 0x96 = 0x80 | 0x16。其中的 0x80 表示这个文件是大文件(有多个分块),hash 值也经过了2重的 sha1 计算。

语言封装

C# 实现

基于官方仓库的csharp代码做了部分修改

/// <summary>
/// Computes the Qiniu hash/etag ("qetag") of a local file.
/// Files of at most one chunk (4 MiB) yield UrlsafeBase64([0x16, sha1(content)]);
/// larger files yield UrlsafeBase64([0x96, sha1(sha1(chunk1) + sha1(chunk2) + ...)]).
/// </summary>
public static class QETag
{
	// Chunk size used by the algorithm: 2^22 bytes (4 MiB).
	const int CHUNK_SIZE = 1 << 22;

	// Returns the SHA-1 digest of the first `count` bytes of `data`.
	// The hasher is IDisposable, so it is released as soon as the digest is computed.
	private static byte[] sha1(byte[] data, int count)
	{
		using (var hasher = System.Security.Cryptography.SHA1.Create())
		{
			return hasher.ComputeHash(data, 0, count);
		}
	}

	// Standard base64 with '+' and '/' swapped for the URL-safe '-' and '_'.
	private static String urlSafeBase64Encode(byte[] data)
	{
		return Convert.ToBase64String(data).Replace('+', '-').Replace('/', '_');
	}

	// Reads from `stream` until `count` bytes are in `buffer` or the stream ends.
	// Stream.Read may return fewer bytes than requested, so a single call is not enough.
	// Returns the number of bytes actually read.
	private static int readFully(Stream stream, byte[] buffer, int count)
	{
		int total = 0;
		while (total < count)
		{
			int n = stream.Read(buffer, total, count - total);
			if (n == 0)
			{
				break;
			}
			total += n;
		}
		return total;
	}

	// Prepends the one-byte marker to the digest and encodes the result.
	private static String encodeWithPrefix(byte prefix, byte[] digest)
	{
		byte[] hashData = new byte[digest.Length + 1];
		hashData[0] = prefix;
		System.Array.Copy(digest, 0, hashData, 1, digest.Length);
		return urlSafeBase64Encode(hashData);
	}

	/// <summary>
	/// Calculates the qetag of the file at <paramref name="path"/>.
	/// </summary>
	/// <param name="path">Path of the file to hash.</param>
	/// <returns>The URL-safe base64 encoded etag string.</returns>
	public static String calcETag(String path)
	{
		using (FileStream fs = File.OpenRead(path))
		{
			long fileLength = fs.Length;

			// One buffer, reused for every chunk, never larger than needed.
			byte[] chunk = new byte[(int)Math.Min(fileLength, (long)CHUNK_SIZE)];

			if (fileLength <= CHUNK_SIZE)
			{
				int read = readFully(fs, chunk, chunk.Length);
				return encodeWithPrefix(0x16, sha1(chunk, read));
			}

			long chunkCount = (fileLength + CHUNK_SIZE - 1) / CHUNK_SIZE;
			using (MemoryStream chunkDigests = new MemoryStream())
			{
				for (long i = 0; i < chunkCount; i++)
				{
					int read = readFully(fs, chunk, CHUNK_SIZE);
					byte[] chunkDigest = sha1(chunk, read);
					chunkDigests.Write(chunkDigest, 0, chunkDigest.Length);
				}

				// Second-level hash over the concatenated per-chunk digests.
				byte[] allDigests = chunkDigests.ToArray();
				return encodeWithPrefix(0x96, sha1(allDigests, allDigests.Length));
			}
		}
	}
}

rust实现

使用ai生成

use std::io::{self, Read, BufReader};
use std::fs::File;
use sha1::Sha1;
use rustc_serialize::base64::{self, ToBase64, URL_SAFE};

/// log2 of the block size used by the etag algorithm.
const BLOCK_BITS: u64 = 22;
/// Block size in bytes: 2^22 (4 MiB).
const BLOCK_SIZE: i64 = 1 << BLOCK_BITS;

/// Returns how many `BLOCK_SIZE` blocks are needed to hold `fsize` bytes,
/// i.e. `fsize / BLOCK_SIZE` rounded up.
fn block_count(fsize: i64) -> i64 {
    let rounded_up = fsize + (BLOCK_SIZE - 1);
    // Floor division by a power of two; same result as an arithmetic right shift.
    rounded_up.div_euclid(BLOCK_SIZE)
}

/// Reads `r` to the end and returns the SHA-1 digest of everything read.
///
/// Data is fed to the hasher through `Sha1::update` in fixed-size pieces
/// instead of `io::copy`, so the hasher does not need to implement `io::Write`.
///
/// # Errors
///
/// Returns any I/O error reported by the reader; `Interrupted` reads are retried.
fn cal_sha1(mut r: impl Read) -> io::Result<Vec<u8>> {
    let mut h = Sha1::new();
    let mut chunk = [0u8; 8192];
    loop {
        match r.read(&mut chunk) {
            // A zero-length read signals end of input.
            Ok(0) => break,
            Ok(n) => h.update(&chunk[..n]),
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(h.digest().bytes().to_vec())
}

fn get_etag(filename: &str) -> io::Result<String> {
    let f = File::open(filename)?;
    let fsize = f.metadata()?.len() as i64;

    let mut sha1_buf = Vec::new();
    let mut r = BufReader::new(&f);

    if block_count(fsize) <= 1 {
        sha1_buf.push(0x16);
        sha1_buf.extend(cal_sha1(&mut r)?);
    } else {
        sha1_buf.push(0x96);
        let mut sha1_block_buf = Vec::new(); 
        
        for _i in 0..block_count(fsize) {
            let mut body = r.by_ref().take(BLOCK_SIZE as u64);
            sha1_block_buf.extend(cal_sha1(&mut body)?);
        }

        sha1_buf.extend(cal_sha1(io::Cursor::new(sha1_block_buf))?);
    }
    Ok(sha1_buf.to_base64(URL_SAFE))
}

/// Prints the etag of the placeholder path, propagating any I/O error.
fn main() -> io::Result<()> {
    println!("{}", get_etag("your_file_path")?);
    Ok(())
}

你需要添加以下依赖到你的Cargo.toml文件中:

[dependencies]
sha1 = "0.6.0"
rustc-serialize = "0.3"

Dart实现

ai生成

import 'dart:io';
import 'dart:convert';
import 'package:crypto/crypto.dart';
import 'dart:math';

/// log2 of the block size used by the etag algorithm.
const int BLOCK_BITS = 22;

/// Block size in bytes: 2^22 (4 MiB).
const int BLOCK_SIZE = 1 << BLOCK_BITS;

/// Returns how many [BLOCK_SIZE] blocks are needed to hold [fsize] bytes,
/// i.e. `fsize / BLOCK_SIZE` rounded up.
int blockCount(int fsize) {
  final int roundedUp = fsize + (BLOCK_SIZE - 1);
  return roundedUp >> BLOCK_BITS;
}

/// Hashes every chunk emitted by [r] with SHA-1, appends the digest bytes
/// to [b], and returns that same list.
Future<List<int>> calSha1(List<int> b, Stream<List<int>> r) async {
  final Digest hashed = await r.transform(sha1).first;
  return b..addAll(hashed.bytes);
}

/// Computes the etag of the file at [filename].
///
/// A file that fits in a single block is hashed directly and tagged with
/// `0x16`. A larger file is hashed block by block; the concatenated block
/// digests are hashed again and tagged with `0x96`. The tag byte plus digest
/// is returned as URL-safe base64.
Future<String> getEtag(String filename) async {
  final f = File(filename);
  final int fsize = await f.length();
  final int blockCnt = blockCount(fsize);
  List<int> sha1Buf = [];

  if (blockCnt <= 1) {
    // file size <= 4M
    sha1Buf.add(0x16);
    sha1Buf = await calSha1(sha1Buf, f.openRead());
  } else {
    // file size > 4M
    sha1Buf.add(0x96);
    List<int> sha1BlockBuf = [];
    for (int i = 0; i < blockCnt; i++) {
      // Stream.take counts stream events, not bytes, and a file stream can
      // only be listened to once, so each block gets its own byte-ranged stream.
      final int start = i * BLOCK_SIZE;
      final int end = min(start + BLOCK_SIZE, fsize);
      sha1BlockBuf = await calSha1(sha1BlockBuf, f.openRead(start, end));
    }
    sha1Buf = await calSha1(sha1Buf, Stream.fromIterable([sha1BlockBuf]));
  }
  return base64UrlEncode(sha1Buf);
}

/// Prints the etag of the placeholder path.
void main() async {
  print(await getEtag('your_file_path'));
}

请注意,这段代码需要 crypto 库来计算 sha1。你可以在 pubspec.yaml 文件中添加以下内容来引入这个库:

dependencies:
  crypto: any