mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-13 20:32:21 +00:00
ARM: fold prologue/epilogue sp updates into push/pop for code size
ARM prologues usually look like: "push {r7, lr}; sub sp, sp, #4". If code size is extremely important, this can be optimised to the single instruction "push {r6, r7, lr}", where we don't actually care about the contents of r6, but pushing it subtracts 4 from sp as a side effect. This should implement such a conversion, predicated on the "minsize" function attribute (-Oz) since I've yet to find any code it actually makes faster. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194264 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
2b01682aa7
commit
323ac85d6a
@ -1857,6 +1857,103 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to absorb an sp adjustment of \p NumBytes into the register list of
/// the push or pop instruction \p MI by adding extra registers below the
/// first register already in the list.
///
/// The transformation is only attempted for functions carrying the MinSize
/// attribute. Returns true if \p MI was rewritten (the caller must then not
/// emit a separate sp update); returns false, leaving \p MI untouched,
/// otherwise.
bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
                                      MachineInstr *MI,
                                      unsigned NumBytes) {
  // This optimisation potentially adds lots of load and store
  // micro-operations, it's only really a great benefit to code-size.
  if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
    return false;

  // If only one register is pushed/popped, LLVM can use an LDR/STR
  // instead. We can't modify those so make sure we're dealing with an
  // instruction we understand.
  bool IsPop = isPopOpcode(MI->getOpcode());
  bool IsPush = isPushOpcode(MI->getOpcode());
  if (!IsPush && !IsPop)
    return false;

  bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
                      MI->getOpcode() == ARM::VLDMDIA_UPD;
  bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
                     MI->getOpcode() == ARM::tPOP ||
                     MI->getOpcode() == ARM::tPOP_RET;

  assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
                          MI->getOperand(1).getReg() == ARM::SP)) &&
         "trying to fold sp update into non-sp-updating push/pop");

  // The VFP push & pop act on D-registers, so we can only fold an adjustment
  // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
  // if this is violated.
  if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
    return false;

  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
  // pred) so the list starts at 4. Thumb1 starts after the predicate.
  int RegListIdx = IsT1PushPop ? 2 : 4;

  // Calculate the space we'll need in terms of registers.
  // NOTE(review): this takes the operand at RegListIdx to be the lowest
  // register transferred and walks downwards from it -- confirm the list is
  // always ordered that way.
  unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
  unsigned RD0Reg, RegsNeeded;
  if (IsVFPPushPop) {
    RD0Reg = ARM::D0;
    RegsNeeded = NumBytes / 8;
  } else {
    RD0Reg = ARM::R0;
    RegsNeeded = NumBytes / 4;
  }

  // We're going to have to strip all list operands off before
  // re-adding them since the order matters, so save the existing ones
  // for later. They are saved last-to-first; the final re-add loop walks
  // RegList backwards so the new, lower registers end up in front.
  SmallVector<MachineOperand, 4> RegList;
  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
    RegList.push_back(MI->getOperand(i));

  MachineBasicBlock *MBB = MI->getParent();
  const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();

  // Now try to find enough space in the reglist to allocate NumBytes.
  for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
       --CurReg, --RegsNeeded) {
    if (!IsPop) {
      // Pushing any register is completely harmless, mark the
      // register involved as undef since we don't care about it in
      // the slightest.
      RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
                                                  false, false, true));
      continue;
    }

    // However, we can only pop an extra register if it's not live. Otherwise we
    // might clobber a return value register. We assume that once we find a live
    // return register all lower ones will be too so there's no use proceeding.
    // NOTE(review): a callee-saved register this function never touches may
    // also be reported dead here; popping into it would change a value the
    // caller expects preserved -- confirm whether that needs a separate check.
    if (MBB->computeRegisterLiveness(TRI, CurReg, MI) !=
        MachineBasicBlock::LQR_Dead)
      return false;

    // Mark the unimportant registers as <def,dead> in the POP. The flags
    // after the register are isDef, isImp, isKill, isDead: the dead flag is
    // the fifth argument, so isKill must be passed explicitly as false
    // (passing only four arguments would set <kill> on a def instead).
    RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
                                                true));
  }

  // Ran out of registers below FirstReg before covering NumBytes.
  if (RegsNeeded > 0)
    return false;

  // Finally we know we can profitably perform the optimisation so go
  // ahead: strip all existing registers off and add them back again
  // in the right order.
  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
    MI->RemoveOperand(i);

  // Add the complete list back in.
  MachineInstrBuilder MIB(MF, &*MI);
  for (int i = RegList.size() - 1; i >= 0; --i)
    MIB.addOperand(RegList[i]);

  return true;
}
|
||||
|
||||
bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
|
||||
unsigned FrameReg, int &Offset,
|
||||
const ARMBaseInstrInfo &TII) {
|
||||
|
@ -362,6 +362,17 @@ bool isIndirectBranchOpcode(int Opc) {
|
||||
return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
|
||||
}
|
||||
|
||||
static inline bool isPopOpcode(int Opc) {
|
||||
return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
|
||||
Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
|
||||
Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
|
||||
}
|
||||
|
||||
static inline bool isPushOpcode(int Opc) {
|
||||
return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
|
||||
Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
|
||||
}
|
||||
|
||||
/// getInstrPredicate - If instruction is predicated, returns its predicate
|
||||
/// condition, otherwise returns AL. It also returns the condition code
|
||||
/// register by reference.
|
||||
@ -401,6 +412,13 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
|
||||
const ARMBaseRegisterInfo& MRI,
|
||||
unsigned MIFlags = 0);
|
||||
|
||||
/// Tries to add registers to the reglist of a given base-updating
|
||||
/// push/pop instruction to adjust the stack by an additional
|
||||
/// NumBytes. This can save a few bytes per function in code-size, but
|
||||
/// obviously generates more memory traffic. As such, it only takes
|
||||
/// effect in functions being optimised for size.
|
||||
bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI,
|
||||
unsigned NumBytes);
|
||||
|
||||
/// rewriteARMFrameIndex / rewriteT2FrameIndex -
|
||||
/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
|
||||
|
@ -93,11 +93,7 @@ static bool isCSRestore(MachineInstr *MI,
|
||||
const ARMBaseInstrInfo &TII,
|
||||
const uint16_t *CSRegs) {
|
||||
// Integer spill area is handled with "pop".
|
||||
if (MI->getOpcode() == ARM::LDMIA_RET ||
|
||||
MI->getOpcode() == ARM::t2LDMIA_RET ||
|
||||
MI->getOpcode() == ARM::LDMIA_UPD ||
|
||||
MI->getOpcode() == ARM::t2LDMIA_UPD ||
|
||||
MI->getOpcode() == ARM::VLDMDIA_UPD) {
|
||||
if (isPopOpcode(MI->getOpcode())) {
|
||||
// The first two operands are predicates. The last two are
|
||||
// imp-def and imp-use of SP. Check everything in between.
|
||||
for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
|
||||
@ -221,42 +217,37 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
|
||||
}
|
||||
|
||||
// Move past area 1.
|
||||
if (GPRCS1Size > 0) MBBI++;
|
||||
MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush;
|
||||
if (GPRCS1Size > 0)
|
||||
FramePtrPush = LastPush = MBBI++;
|
||||
|
||||
// Determine starting offsets of spill areas.
|
||||
bool HasFP = hasFP(MF);
|
||||
unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
|
||||
unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
|
||||
unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
|
||||
if (HasFP)
|
||||
int FramePtrOffsetInPush = 0;
|
||||
if (HasFP) {
|
||||
FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
|
||||
AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
|
||||
NumBytes);
|
||||
}
|
||||
AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
|
||||
AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
|
||||
AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
|
||||
|
||||
// Set FP to point to the stack slot that contains the previous FP.
|
||||
// For iOS, FP is R7, which has now been stored in spill area 1.
|
||||
// Otherwise, if this is not iOS, all the callee-saved registers go
|
||||
// into spill area 1, including the FP in R11. In either case, it is
|
||||
// now safe to emit this assignment.
|
||||
if (HasFP) {
|
||||
int FramePtrOffset = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
|
||||
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, MBBI, dl, TII,
|
||||
FramePtr, ARM::SP, FramePtrOffset,
|
||||
MachineInstr::FrameSetup);
|
||||
}
|
||||
|
||||
// Move past area 2.
|
||||
if (GPRCS2Size > 0) MBBI++;
|
||||
if (GPRCS2Size > 0) {
|
||||
LastPush = MBBI++;
|
||||
}
|
||||
|
||||
// Move past area 3.
|
||||
if (DPRCSSize > 0) {
|
||||
MBBI++;
|
||||
LastPush = MBBI++;
|
||||
// Since vpush register list cannot have gaps, there may be multiple vpush
|
||||
// instructions in the prologue.
|
||||
while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
|
||||
MBBI++;
|
||||
LastPush = MBBI++;
|
||||
}
|
||||
|
||||
// Move past the aligned DPRCS2 area.
|
||||
@ -272,8 +263,12 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
|
||||
|
||||
if (NumBytes) {
|
||||
// Adjust SP after all the callee-save spills.
|
||||
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
|
||||
MachineInstr::FrameSetup);
|
||||
if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes))
|
||||
FramePtrOffsetInPush += NumBytes;
|
||||
else
|
||||
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
|
||||
MachineInstr::FrameSetup);
|
||||
|
||||
if (HasFP && isARM)
|
||||
// Restore from fp only in ARM mode: e.g. sub sp, r7, #24
|
||||
// Note it's not safe to do this in Thumb2 mode because it would have
|
||||
@ -286,6 +281,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
|
||||
AFI->setShouldRestoreSPFromFP(true);
|
||||
}
|
||||
|
||||
// Set FP to point to the stack slot that contains the previous FP.
|
||||
// For iOS, FP is R7, which has now been stored in spill area 1.
|
||||
// Otherwise, if this is not iOS, all the callee-saved registers go
|
||||
// into spill area 1, including the FP in R11. In either case, it
|
||||
// is in area one and the adjustment needs to take place just after
|
||||
// that push.
|
||||
if (HasFP)
|
||||
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII,
|
||||
FramePtr, ARM::SP, FramePtrOffsetInPush,
|
||||
MachineInstr::FrameSetup);
|
||||
|
||||
|
||||
if (STI.isTargetELF() && hasFP(MF))
|
||||
MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
|
||||
AFI->getFramePtrSpillOffset());
|
||||
@ -380,12 +387,17 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
if (NumBytes != 0)
|
||||
emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
|
||||
} else {
|
||||
MachineBasicBlock::iterator FirstPop = MBBI;
|
||||
|
||||
// Unwind MBBI to point to first LDR / VLDRD.
|
||||
const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
|
||||
if (MBBI != MBB.begin()) {
|
||||
do
|
||||
do {
|
||||
if (isPopOpcode(MBBI->getOpcode()))
|
||||
FirstPop = MBBI;
|
||||
|
||||
--MBBI;
|
||||
while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
|
||||
} while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
|
||||
if (!isCSRestore(MBBI, TII, CSRegs))
|
||||
++MBBI;
|
||||
}
|
||||
@ -429,8 +441,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
ARM::SP)
|
||||
.addReg(FramePtr));
|
||||
}
|
||||
} else if (NumBytes)
|
||||
emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
|
||||
} else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, FirstPop, NumBytes))
|
||||
emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
|
||||
|
||||
// Increment past our save areas.
|
||||
if (AFI->getDPRCalleeSavedAreaSize()) {
|
||||
|
@ -164,11 +164,17 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
|
||||
AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
|
||||
NumBytes = DPRCSOffset;
|
||||
|
||||
int FramePtrOffsetInBlock = 0;
|
||||
if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) {
|
||||
FramePtrOffsetInBlock = NumBytes;
|
||||
NumBytes = 0;
|
||||
}
|
||||
|
||||
// Adjust FP so it point to the stack slot that contains the previous FP.
|
||||
if (HasFP) {
|
||||
int FramePtrOffset = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
|
||||
FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
|
||||
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
|
||||
.addReg(ARM::SP).addImm(FramePtrOffset / 4)
|
||||
.addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
|
||||
.setMIFlags(MachineInstr::FrameSetup));
|
||||
if (NumBytes > 508)
|
||||
// If offset is > 508 then sp cannot be adjusted in a single instruction,
|
||||
@ -292,8 +298,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
&MBB.front() != MBBI &&
|
||||
prior(MBBI)->getOpcode() == ARM::tPOP) {
|
||||
MachineBasicBlock::iterator PMBBI = prior(MBBI);
|
||||
emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
|
||||
} else
|
||||
if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes))
|
||||
emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
|
||||
} else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
|
||||
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
|
||||
}
|
||||
}
|
||||
|
126
test/CodeGen/ARM/fold-stack-adjust.ll
Normal file
126
test/CodeGen/ARM/fold-stack-adjust.ll
Normal file
@ -0,0 +1,126 @@
|
||||
; RUN: llc -mtriple=thumbv7-apple-darwin-eabi < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=thumbv6m-apple-darwin-eabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
|
||||
; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS
|
||||
|
||||
|
||||
declare void @bar(i8*)
|
||||
|
||||
%bigVec = type [2 x double]
|
||||
|
||||
@var = global %bigVec zeroinitializer
|
||||
|
||||
define void @check_simple() minsize {
|
||||
; CHECK-LABEL: check_simple:
|
||||
; CHECK: push.w {r7, r8, r9, r10, r11, lr}
|
||||
; CHECK-NOT: sub sp, sp,
|
||||
; ...
|
||||
; CHECK-NOT: add sp, sp,
|
||||
; CHECK: pop.w {r7, r8, r9, r10, r11, pc}
|
||||
|
||||
; CHECK-T1-LABEL: check_simple:
|
||||
; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
|
||||
; CHECK-T1: add r7, sp, #16
|
||||
; CHECK-T1-NOT: sub sp, sp,
|
||||
; ...
|
||||
; CHECK-T1-NOT: add sp, sp,
|
||||
; CHECK-T1: pop {r3, r4, r5, r6, r7, pc}
|
||||
|
||||
; iOS always has a frame pointer and messing with the push affects
|
||||
; how it's set in the prologue. Make sure we get that right.
|
||||
; CHECK-IOS-LABEL: check_simple:
|
||||
; CHECK-IOS: push {r3, r4, r5, r6, r7, lr}
|
||||
; CHECK-NOT: sub sp,
|
||||
; CHECK-IOS: add r7, sp, #16
|
||||
; CHECK-NOT: sub sp,
|
||||
; ...
|
||||
; CHECK-NOT: add sp,
|
||||
; CHECK-IOS: pop {r3, r4, r5, r6, r7, pc}
|
||||
|
||||
%var = alloca i8, i32 16
|
||||
call void @bar(i8* %var)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @check_simple_too_big() minsize {
|
||||
; CHECK-LABEL: check_simple_too_big:
|
||||
; CHECK: push.w {r11, lr}
|
||||
; CHECK: sub sp,
|
||||
; ...
|
||||
; CHECK: add sp,
|
||||
; CHECK: pop.w {r11, pc}
|
||||
%var = alloca i8, i32 64
|
||||
call void @bar(i8* %var)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @check_vfp_fold() minsize {
|
||||
; CHECK-LABEL: check_vfp_fold:
|
||||
; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
|
||||
; CHECK: vpush {d6, d7, d8, d9}
|
||||
; CHECK-NOT: sub sp,
|
||||
; ...
|
||||
; CHECK: vldmia r[[GLOBREG]], {d8, d9}
|
||||
; ...
|
||||
; CHECK-NOT: add sp,
|
||||
; CHECK: vpop {d6, d7, d8, d9}
|
||||
; CHECK: pop {r[[GLOBREG]], pc}
|
||||
|
||||
; iOS uses aligned NEON stores here, which is convenient since we
|
||||
; want to make sure that works too.
|
||||
; CHECK-IOS-LABEL: check_vfp_fold:
|
||||
; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
|
||||
; CHECK-IOS: sub.w r4, sp, #16
|
||||
; CHECK-IOS: bic r4, r4, #15
|
||||
; CHECK-IOS: mov sp, r4
|
||||
; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
|
||||
; ...
|
||||
; CHECK-IOS: add r4, sp, #16
|
||||
; CHECK-IOS: vld1.64 {d8, d9}, [r4:128]
|
||||
; CHECK-IOS: mov sp, r4
|
||||
; CHECK-IOS: pop {r4, r7, pc}
|
||||
|
||||
%var = alloca i8, i32 16
|
||||
|
||||
%tmp = load %bigVec* @var
|
||||
call void @bar(i8* %var)
|
||||
store %bigVec %tmp, %bigVec* @var
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; This function should use just enough space that the "add sp, sp, ..." could be
|
||||
; folded in except that doing so would clobber the value being returned.
|
||||
define i64 @check_no_return_clobber() minsize {
|
||||
; CHECK-LABEL: check_no_return_clobber:
|
||||
; CHECK: push.w {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
; CHECK-NOT: sub sp,
|
||||
; ...
|
||||
; CHECK: add sp, #40
|
||||
; CHECK: pop.w {r11, pc}
|
||||
|
||||
; Just to keep iOS FileCheck within previous function:
|
||||
; CHECK-IOS-LABEL: check_no_return_clobber:
|
||||
|
||||
%var = alloca i8, i32 40
|
||||
call void @bar(i8* %var)
|
||||
ret i64 0
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc double @check_vfp_no_return_clobber() minsize {
|
||||
; CHECK-LABEL: check_vfp_no_return_clobber:
|
||||
; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
|
||||
; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9}
|
||||
; CHECK-NOT: sub sp,
|
||||
; ...
|
||||
; CHECK: add sp, #64
|
||||
; CHECK: vpop {d8, d9}
|
||||
; CHECK: pop {r[[GLOBREG]], pc}
|
||||
|
||||
%var = alloca i8, i32 64
|
||||
|
||||
%tmp = load %bigVec* @var
|
||||
call void @bar(i8* %var)
|
||||
store %bigVec %tmp, %bigVec* @var
|
||||
|
||||
ret double 1.0
|
||||
}
|
Loading…
Reference in New Issue
Block a user