mirror of https://github.com/llvm/torch-mlir
Replace LCG algorithm with squares64 algorithm in AtenUniformOp (#1633)

This commit replaces the LCG algorithm used by the `TorchToLinalg` lowering of `AtenUniformOp` to generate random numbers with the `squares64` algorithm, because the LCG algorithm was producing tensors that were highly correlated with one another.

Squares64 algorithm: https://arxiv.org/abs/2004.06278

Closes https://github.com/llvm/torch-mlir/issues/1608

branch pull/1657/head
parent e66bf7b8cb
commit b4b92c990e
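Why the LCG output was correlated: the removed lowering chains temp_d = (index_d + temp_{d-1}) * multiplier + incrementStep over the output dimensions, so the final value is an affine function of the seed, and two tensors drawn with different global seeds differ elementwise by a constant modulo 2^64. A minimal pure-Python sketch of the defect (illustration only, not code from the patch; the constants are the ones in the removed code below):

MASK = (1 << 64) - 1
MULTIPLIER = 6364136223846793005  # constants from the removed LCG code
INCREMENT = 1442695040888963407

def lcg_element(indices, seed):
    # Per-element recurrence of the old lowering, chained over dimensions.
    temp = seed
    for idx in indices:
        temp = ((idx + temp) * MULTIPLIER + INCREMENT) & MASK
    return temp

a = [lcg_element((i,), seed=1) for i in range(8)]
b = [lcg_element((i,), seed=2) for i in range(8)]
# Every element differs by the same constant, so the two draws are
# almost perfectly correlated:
print({(y - x) & MASK for x, y in zip(a, b)})  # prints a single value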
@@ -95,6 +95,7 @@ TORCHDYNAMO_XFAIL_SET = {
     "StdDimNoneDimModule_basic",
     "StdUnbiasedModule_basic",
     "UniformModule_basic",
+    "UniformNoCorrelationModule_basic",
     # %1 = torch.operator "aten.scalar_tensor"(%float8.000000e00, %int6, %int0, %cpu, %none) : (!torch.float, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.tensor
     "ElementwiseWhereScalarModule_basic",
     "ElementwiseWhereScalarOtherModule_basic",
@@ -750,6 +751,7 @@ LTC_XFAIL_SET = {
     "TensorToInt_basic",
     "TensorsConcatModule_basic",
     "UniformModule_basic",
+    "UniformNoCorrelationModule_basic",
     "UnsafeViewCollapseDynamicWithAtenSizeIntModule_basic",
     "ViewCollapseDynamicWithAtenSizeIntModule_basic",
     "AtenEmbeddingBagSumExample_basic",
@@ -57,6 +57,61 @@ public:
 };
 } // namespace
 
+static Value toLinearIndex(OpBuilder &b, Location loc,
+                           ArrayRef<Value> indicesIntValues,
+                           ArrayRef<Value> shapeIntValues) {
+  assert(indicesIntValues.size() == shapeIntValues.size() &&
+         "Expected `indices` and `shape` to have the same size");
+  Value result =
+      b.create<arith::ConstantOp>(loc, b.getZeroAttr(b.getI64Type()));
+  for (auto [index, stride] : llvm::zip(indicesIntValues, shapeIntValues)) {
+    assert(index.getType().isa<mlir::IntegerType>() &&
+           stride.getType().isa<mlir::IntegerType>() &&
+           "Input arrays to `toLinearIndex` must only contain values of type "
+           "`mlir::IntegerType`");
+    Value mul = b.create<arith::MulIOp>(loc, result, stride);
+    result = b.create<arith::AddIOp>(loc, mul, index);
+  }
+  return result;
+}
+
+// Squares64 Algorithm for generating 64-bit random numbers.
+// See: https://arxiv.org/abs/2004.06278
+static Value randomUniformUInt(OpBuilder &b, Location loc, Value ctr,
+                               Value key) {
+  auto mul = [&](Value lhs, Value rhs) -> Value {
+    return b.create<arith::MulIOp>(loc, lhs, rhs);
+  };
+  auto add = [&](Value lhs, Value rhs) -> Value {
+    return b.create<arith::AddIOp>(loc, lhs, rhs);
+  };
+  Value cst32 = b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(32));
+  auto shiftRight32 = [&](Value val) -> Value {
+    return b.create<arith::ShRUIOp>(loc, val, cst32);
+  };
+  auto swapLoHi = [&](Value val) -> Value {
+    Value leftShift = b.create<arith::ShLIOp>(loc, val, cst32);
+    Value rightShift = shiftRight32(val);
+    return b.create<arith::OrIOp>(loc, leftShift, rightShift);
+  };
+  auto bitwiseXOr = [&](Value lhs, Value rhs) -> Value {
+    return b.create<arith::XOrIOp>(loc, lhs, rhs);
+  };
+
+  Value t, x, y, z;
+  x = mul(ctr, key);
+  y = x;
+  z = add(y, key);
+  x = add(mul(x, x), y);
+  x = swapLoHi(x);
+  x = add(mul(x, x), z);
+  x = swapLoHi(x);
+  x = add(mul(x, x), y);
+  x = swapLoHi(x);
+  t = x = add(mul(x, x), z);
+  x = swapLoHi(x);
+  return bitwiseXOr(t, shiftRight32(add(mul(x, x), y)));
+}
+
 namespace {
 class ConvertAtenUniformOp : public OpConversionPattern<AtenUniformOp> {
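A pure-Python model of the two helpers added above (an illustrative sketch, not code from the patch). `to_linear_index` mirrors the row-major linearization built by `toLinearIndex`, and `squares64` mirrors the four square-and-swap rounds built by `randomUniformUInt`. In the lowering, the per-tensor `key` comes from `TorchConversion::GetNextSeedOp`, so distinct uniform draws use distinct keys while each element's linear index serves as the counter:

MASK = (1 << 64) - 1

def to_linear_index(indices, shape):
    # Row-major linearization, mirroring the IR built by `toLinearIndex`:
    # result = (...((0 * s0 + i0) * s1 + i1)...) * s_{N-1} + i_{N-1}
    result = 0
    for index, dim_size in zip(indices, shape):
        result = (result * dim_size + index) & MASK
    return result

def squares64(ctr, key):
    # Four "square and swap halves" rounds, as in `randomUniformUInt`.
    def swap_lo_hi(v):
        return ((v << 32) | (v >> 32)) & MASK
    x = y = (ctr * key) & MASK
    z = (y + key) & MASK
    x = swap_lo_hi((x * x + y) & MASK)
    x = swap_lo_hi((x * x + z) & MASK)
    x = swap_lo_hi((x * x + y) & MASK)
    t = (x * x + z) & MASK
    x = swap_lo_hi(t)
    return t ^ (((x * x + y) & MASK) >> 32)

# Element (1, 2) of a 4x8 tensor; the key here is an arbitrary stand-in for
# the value produced by TorchConversion::GetNextSeedOp.
print(hex(squares64(to_linear_index((1, 2), (4, 8)), key=0x548C9DECBCE65297)))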
@@ -82,30 +137,8 @@ public:
       return rewriter.notifyMatchFailure(
           op, "The generator has to be None because only global default "
               "generator is supported");
 
-    // Build the core formula of LCG Algorithm that makes use of element index:
-    // For output matrix with rank N:
-    // temp1 = (cast(I64, index(D.0)) + seed) * multiplier + incrementStep
-    // ...
-    // tempN = (cast(I64, index(D.(N))) + tempN-1) * multiplier + incr
-    // Refer to https://reviews.llvm.org/D101364.
-    // The value of multiplier and incrementStep are referenced from
-    // https://en.wikipedia.org/wiki/Linear_congruential_generator for 2^64.
-    Value multiplier = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getI64IntegerAttr(6364136223846793005));
-    Value incrementStep = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getI64IntegerAttr(1442695040888963407));
-    // Tn = (index + Tn-1) * multiplier + incrementStep
-    auto getNextTemp = [&](OpBuilder &b, Value index, Value temp) {
-      Value castIndex =
-          b.create<arith::IndexCastOp>(loc, b.getI64Type(), index);
-      Value add = b.create<arith::AddIOp>(loc, castIndex, temp);
-      Value mult = b.create<arith::MulIOp>(loc, add, multiplier);
-      return b.create<arith::AddIOp>(loc, mult, incrementStep);
-    };
-
-    // Get initial seed, min and max used by `linalg.generic` compute payload.
-    Value initialSeed = rewriter.create<TorchConversion::GetNextSeedOp>(loc);
+    // Get key, min and max used by `linalg.generic` compute payload.
+    Value key = rewriter.create<TorchConversion::GetNextSeedOp>(loc);
     Value min = convertScalarToDtype(rewriter, loc, from, elemTy);
     Value max = convertScalarToDtype(rewriter, loc, to, elemTy);
@@ -116,6 +149,8 @@ public:
     SmallVector<utils::IteratorType> iteratorTypes(
         resultRank, utils::IteratorType::parallel);
     SmallVector<Value> sizes = getTensorSizes(rewriter, loc, self);
+    SmallVector<Value> sizesIntValues =
+        castIndexVectorToInt64Vector(rewriter, loc, sizes);
     Value initTensor =
         rewriter.create<tensor::EmptyOp>(loc, getAsOpFoldResult(sizes), elemTy);
     Value uniformRes =
@@ -124,11 +159,16 @@ public:
         loc, initTensor.getType(), /*inputs=*/ValueRange{},
         /*outputs=*/initTensor, indexingMaps, iteratorTypes,
         [&](OpBuilder &b, Location loc, ValueRange args) {
-          Value temp = initialSeed;
+          SmallVector<Value> indicesIntValues;
           for (int i = 0; i < resultRank; i++) {
-            Value index = b.create<linalg::IndexOp>(loc, i);
-            temp = getNextTemp(b, index, temp);
+            indicesIntValues.push_back(castIndexToInt64(
+                b, loc, b.create<linalg::IndexOp>(loc, i)));
           }
+
+          Value linearIndex =
+              toLinearIndex(b, loc, indicesIntValues, sizesIntValues);
+          Value randomVal = randomUniformUInt(b, loc, linearIndex, key);
+
           // scale = (max - min) * const(F64, 5.4210108E-20)
           // which is derived from rand(min,max) =
           // rand()/(RAND_MAX/(max-min)) where RAND_MAX = 2^64 - 1
@@ -139,7 +179,7 @@ public:
 
           // res = cast(F64, tempN) * scale + min
           Value updateFloat =
-              b.create<arith::UIToFPOp>(loc, elemTy, temp);
+              b.create<arith::UIToFPOp>(loc, elemTy, randomVal);
           Value updateScaled =
               b.create<arith::MulFOp>(loc, updateFloat, scale);
           Value res = b.create<arith::AddFOp>(loc, updateScaled, min);
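The scaling in the payload, in plain numbers (an illustrative sketch, not code from the patch): the raw 64-bit value is mapped into [min, max] as res = u64 * (max - min) * 5.4210108E-20 + min, where 5.4210108E-20 is 1 / (2^64 - 1) rounded, matching the `scale` comment above:

RAND_MAX = 2**64 - 1                      # randomUniformUInt covers all of u64
assert abs(1 / RAND_MAX - 5.4210108e-20) < 1e-26

def to_uniform(u64, lo, hi):
    # res = cast(F64, u64) * scale + lo, with scale = (hi - lo) / (2^64 - 1)
    scale = (hi - lo) * 5.4210108e-20
    return u64 * scale + lo

print(to_uniform(0, -1.0, 1.0))           # -1.0 (low end of the range)
print(to_uniform(RAND_MAX, -1.0, 1.0))    # ~1.0 (high end of the range)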
@@ -38,9 +38,62 @@ class UniformModule(torch.nn.Module):
 @register_test_case(module_factory=lambda: UniformModule())
 def UniformModule_basic(module, tu: TestUtils):
     module.forward(
-        tu.rand(256, 512, 8).double(),
-        tu.rand(512, 1024, 4).double(),
-        tu.rand(512, 256, 4).double())
+        tu.rand(256, 512, 12).double(),
+        tu.rand(512, 1024, 12).double(),
+        tu.rand(512, 256, 12).double())
 
 # ==============================================================================
 
+class UniformNoCorrelationModule(torch.nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def correlation(self, stack):
+        """Calculate the correlation of the two rows in `stack`.
+
+        TODO: Remove this by adding support for `torch.corrcoef`.
+        """
+        m = stack - torch.mean(stack, dim=1, keepdim=True)
+        cov = torch.matmul(m, m.t()) / (stack.size()[1] - 1)
+        return cov[0, 1] / torch.sqrt(cov[0, 0] * cov[1, 1])
+
+    @export
+    @annotate_args([
+        None,
+        ([1000], torch.float64, True),
+    ])
+    def forward(self, x):
+        # Correlation of two independent uniforms
+        a = torch.ops.aten.uniform(x)
+        b = torch.ops.aten.uniform(x)
+        stack = torch.cat((a.unsqueeze(0), b.unsqueeze(0)))
+        corr_a_b = self.correlation(stack)
+
+        # Split first dimension of large buffer
+        larger = torch.empty((2,) + x.size(), dtype=torch.float64)
+        larger = torch.ops.aten.uniform(larger)
+        corr_major = self.correlation(larger)
+
+        # Split second dimension of large buffer
+        corr_minor = self.correlation(larger.t().reshape(2, -1))
+
+        # This is hacky. The problem with returning just the correlations
+        # is that `torch.allclose` becomes stricter the closer values are to
+        # zero. Since the correlations are on the order of 1E-3, with
+        # `rtol=1E-3` and `atol=1E-7` they would have to agree to within
+        # `atol + rtol * correlation = 1E-6`, which is too strict. Instead,
+        # the correlations are explicitly required to be less than 0.001.
+        return torch.where(torch.abs(corr_a_b) < 0.001, 1, 2), \
+               torch.where(torch.abs(corr_major) < 0.001, 1, 2), \
+               torch.where(torch.abs(corr_minor) < 0.001, 1, 2)
+
+
+@register_test_case(module_factory=lambda: UniformNoCorrelationModule())
+def UniformNoCorrelationModule_basic(module, tu: TestUtils):
+    module.forward(
+        tu.rand(1000).double())
+
+# ==============================================================================
+
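A side note on the `correlation` helper above: it is a hand-rolled Pearson correlation, and per its TODO it should reduce to `torch.corrcoef(stack)[0, 1]` once torch-mlir supports that op. A quick equivalence check (illustrative sketch; assumes a torch build that provides `torch.corrcoef`):

import torch

stack = torch.rand(2, 1000, dtype=torch.float64)
m = stack - torch.mean(stack, dim=1, keepdim=True)
cov = torch.matmul(m, m.t()) / (stack.size()[1] - 1)
hand_rolled = cov[0, 1] / torch.sqrt(cov[0, 0] * cov[1, 1])
assert torch.allclose(hand_rolled, torch.corrcoef(stack)[0, 1])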
@@ -132,8 +185,8 @@ class BernoulliFloatModule(torch.nn.Module):
 @register_test_case(module_factory=lambda: BernoulliFloatModule())
 def BernoulliFloatModule_basic(module, tu: TestUtils):
     module.forward(
-        tu.rand(512, 512, 10).double(),
-        tu.rand(512, 512, 10).double())
+        tu.rand(512, 512, 16).double(),
+        tu.rand(512, 512, 16).double())
 
 # ==============================================================================