/*
 * tclAllocZippy.c --
 *
 *	This is a very flexible storage allocator for Tcl, for use with or
 *	without threads.
 *
 *	It is essentially the ex-tclThreadAlloc, AOLserver's fast threaded
 *	allocator. Mods with respect to the original:
 *	  - it is split into two: the freeObj list is part of Tcl itself,
 *	    in tclAlloc.c, and the malloc() part is here
 *	  - split blocks in the shared pool before mallocing again for
 *	    improved cache usage
 *	  - stats and Tcl_GetMemoryInfo are gone
 *	  - adapt for unthreaded usage as a replacement for the ex-tclAlloc
 *	  - (TODO!) build zippy as a pre-loadable library to use with a
 *	    native build as a malloc replacement. Difficulty is to make it
 *	    portable (easy enough on modern elf/unix, to be researched on
 *	    win and mac). This would be the best option, instead of
 *	    MULTI. It could be built in two versions (perf, debug/stats)
 *
 * The Initial Developer of the Original Code is America Online, Inc.
 * Portions created by AOL are Copyright (C) 1999 America Online, Inc.
 *
 * Copyright (c) 2008-2013 by Miguel Sofer. All rights reserved.
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 */

#include "tclInt.h"
#include "tclAlloc.h"

#undef TclpAlloc
#undef TclpRealloc
#undef TclpFree

/*
 * The following struct stores accounting information for each block,
 * including two small magic numbers and a bucket number when in use, or a
 * next pointer when free. An upper bound on the originally requested size
 * (not including the Block overhead) is also maintained, encoded in the
 * inUse byte via the per-bucket shift.
 */

typedef struct Block {
    union {
	struct Block *next;		/* Next in free list. */
	struct {
	    unsigned char magic1;	/* First magic number. */
	    unsigned char bucket;	/* Bucket block allocated from. */
	    unsigned char inUse;	/* Block memory currently in use, used
					 * by realloc. */
	    unsigned char magic2;	/* Second magic number. */
	} s;
    } u;
} Block;

#define OFFSET		ALIGN(sizeof(Block))

#define nextBlock	u.next
#define sourceBucket	u.s.bucket
#define magicNum1	u.s.magic1
#define magicNum2	u.s.magic2
#define used		u.s.inUse
#define MAGIC		0xEF

/*
 * The following defines the minimum and maximum block sizes and the number
 * of buckets in the bucket cache.
 *                        32b    64b   Apple-32b(?)
 *     ALLOCALIGN           8     16      16
 *     sizeof(Block)        8     16      16
 *     OFFSET               8     16      16
 *     MINALLOC            16     32      32
 *     NBUCKETS            11     10      10
 *     MAXALLOC         16384  16384   16384
 *     small allocs      1024    512    1024
 *        at a time
 */

#define MINALLOC	ALIGN(OFFSET+8)
#define NBUCKETS	(11 - (MINALLOC >> 5))
#define MAXALLOC	(MINALLOC << (NBUCKETS - 1))
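/*
 * Worked example of the geometry above (a sketch assuming the 64-bit column
 * of the table, i.e. OFFSET == 16, MINALLOC == 32, NBUCKETS == 10): bucket i
 * holds blocks of MINALLOC << i bytes, so the block sizes run 32, 64, 128,
 * ..., 16384. A TclpAlloc(100) call needs 100 + OFFSET == 116 bytes and is
 * served from bucket 2 (128-byte blocks); any request needing more than
 * MAXALLOC bytes in total bypasses the buckets and goes straight to
 * malloc().
 */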
/*
 * The following structure defines a bucket of blocks, optionally with
 * various accounting and statistics information.
 */

typedef struct Bucket {
    Block *firstPtr;		/* First block available */
    long numFree;		/* Number of blocks available */
} Bucket;

/*
 * The following array specifies various per-bucket limits and locks. The
 * values are statically initialized to avoid calculating them repeatedly.
 */

static struct {
    size_t blockSize;		/* Bucket blocksize. */
    int shift;
    int maxBlocks;		/* Max blocks before move to share. */
    int numMove;		/* Num blocks to move to share. */
    Tcl_Mutex *lockPtr;		/* Share bucket lock. */
} bucketInfo[NBUCKETS];

/*
 * The per-thread allocation cache, used by aNATIVE, aZIPPY and aMULTI.
 */

typedef struct Cache {
    Bucket buckets[NBUCKETS];	/* The buckets for this thread */
} Cache;

static Cache sharedCache;
#define sharedPtr (&sharedCache)

static void		InitBucketInfo(void);
static inline char *	Block2Ptr(Block *blockPtr, int bucket,
			    unsigned int reqSize);
static inline Block *	Ptr2Block(char *ptr);

static int		GetBlocks(Cache *cachePtr, int bucket);
static void		PutBlocks(Cache *cachePtr, int bucket, int numMove);

static inline void	LockBucket(int bucket);
static inline void	UnlockBucket(int bucket);

/*
 *-------------------------------------------------------------------------
 *
 * TclXpInitAlloc --
 *
 *	Initialize the memory system.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	Initialize the mutex used to serialize allocations.
 *
 *-------------------------------------------------------------------------
 */

static void
InitBucketInfo(void)
{
    int i;
    int shift = 0;

    for (i = 0; i < NBUCKETS; ++i) {
	bucketInfo[i].blockSize = MINALLOC << i;
	while (((bucketInfo[i].blockSize - OFFSET) >> shift) > 255) {
	    ++shift;
	}
	bucketInfo[i].shift = shift;
	if (TCL_THREADED) {
	    /* TODO: clearer logic? Change move to keep? */
	    bucketInfo[i].maxBlocks = 1 << (NBUCKETS - 1 - i);
	    bucketInfo[i].numMove = i < NBUCKETS - 1 ?
		    1 << (NBUCKETS - 2 - i) : 1;
	    bucketInfo[i].lockPtr = TclpNewAllocMutex();
	}
    }
}

void
TclXpInitAlloc(void)
{
    /*
     * Set the params for the correct allocator
     */

    if (TCL_THREADED) {
	InitBucketInfo();
	TclSetSharedAllocCache(sharedPtr);
    }
}

/*
 *----------------------------------------------------------------------
 *
 * TclXpFinalizeAlloc --
 *
 *	This procedure is used to destroy all private resources used in this
 *	file.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
TclXpFinalizeAlloc(void)
{
    if (TCL_THREADED) {
	unsigned int i;

	for (i = 0; i < NBUCKETS; ++i) {
	    TclpFreeAllocMutex(bucketInfo[i].lockPtr);
	    bucketInfo[i].lockPtr = NULL;
	}
    }
}

/*
 *----------------------------------------------------------------------
 *
 * TclXpFreeAllocCache --
 *
 *	Flush and delete a cache.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
TclXpFreeAllocCache(
    void *arg)
{
    Cache *cachePtr = arg;
    register unsigned int bucket;

    /*
     * Flush blocks.
     */

    for (bucket = 0; bucket < NBUCKETS; ++bucket) {
	if (cachePtr->buckets[bucket].numFree > 0) {
	    PutBlocks(cachePtr, bucket, cachePtr->buckets[bucket].numFree);
	}
    }
    free(cachePtr);
}
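/*
 * Block2Ptr and Ptr2Block below convert between a Block header and the user
 * pointer OFFSET bytes past it. The layout around an in-use block looks as
 * follows (a sketch; field sizes are as in the Block union above, and the
 * same OFFSET bytes hold the nextBlock pointer while the block sits on a
 * free list):
 *
 *    blockPtr                              ptr = (char *) blockPtr + OFFSET
 *       |                                     |
 *       v                                     v
 *       +----------------------------------+---------------------------+
 *       | magic1 | bucket | inUse | magic2 |    user data (reqSize)    |
 *       +----------------------------------+---------------------------+
 *       |<-------------- OFFSET ---------->|
 */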
/*
 *----------------------------------------------------------------------
 *
 * Block2Ptr, Ptr2Block --
 *
 *	Convert between internal blocks and user pointers.
 *
 * Results:
 *	User pointer or internal block.
 *
 * Side effects:
 *	Invalid blocks will abort the server.
 *
 *----------------------------------------------------------------------
 */

static inline char *
Block2Ptr(
    Block *blockPtr,
    int bucket,
    unsigned int reqSize)
{
    register void *ptr;

    blockPtr->magicNum1 = blockPtr->magicNum2 = MAGIC;
    if (bucket == NBUCKETS) {
	blockPtr->used = 255;
    } else {
	blockPtr->used = (reqSize >> bucketInfo[bucket].shift);
    }
    blockPtr->sourceBucket = bucket;
    ptr = (void *) (((char *) blockPtr) + OFFSET);
    return (char *) ptr;
}

static inline Block *
Ptr2Block(
    char *ptr)
{
    register Block *blockPtr;

    blockPtr = (Block *) (((char *) ptr) - OFFSET);
    if (blockPtr->magicNum1 != MAGIC || blockPtr->magicNum2 != MAGIC) {
	Tcl_Panic("alloc: invalid block: %p: %x %x", blockPtr,
		blockPtr->magicNum1, blockPtr->magicNum2);
    }
    return blockPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * TclpAlloc --
 *
 *	Allocate memory.
 *
 * Results:
 *	Pointer to memory just beyond Block pointer.
 *
 * Side effects:
 *	May allocate more blocks for a bucket.
 *
 *----------------------------------------------------------------------
 */

static inline Cache *
GetAllocCache(void)
{
    Cache *cachePtr = TclGetAllocCache();

    if (cachePtr == NULL) {
	cachePtr = calloc(1, sizeof(Cache));
	if (cachePtr == NULL) {
	    Tcl_Panic("alloc: could not allocate new cache");
	}
	TclSetAllocCache(cachePtr);
    }
    return cachePtr;
}

char *
TclpAlloc(
    unsigned int reqSize)
{
    Cache *cachePtr;
    Block *blockPtr;
    register int bucket;
    size_t size;

    if (TCL_PURIFY) {
	return (char *) malloc(reqSize);
    }

    cachePtr = GetAllocCache();

#ifndef __LP64__
    if (sizeof(int) >= sizeof(size_t)) {
	/* An unsigned int overflow can also be a size_t overflow */
	const size_t zero = 0;
	const size_t max = ~zero;

	if (((size_t) reqSize) > max - OFFSET) {
	    /* Requested allocation exceeds memory */
	    return NULL;
	}
    }
#endif

    /*
     * Increment the requested size to include room for the Block structure.
     * Call malloc() directly if the required amount is greater than the
     * largest block, otherwise pop the smallest block large enough,
     * allocating more blocks if necessary.
     */

    size = reqSize + OFFSET;
    if (size > MAXALLOC) {
	bucket = NBUCKETS;
	blockPtr = malloc(size);
    } else {
	blockPtr = NULL;
	bucket = 0;
	while (bucketInfo[bucket].blockSize < size) {
	    bucket++;
	}
	if (cachePtr->buckets[bucket].numFree || GetBlocks(cachePtr, bucket)) {
	    blockPtr = cachePtr->buckets[bucket].firstPtr;
	    cachePtr->buckets[bucket].firstPtr = blockPtr->nextBlock;
	    cachePtr->buckets[bucket].numFree--;
	}
    }
    if (blockPtr == NULL) {
	return NULL;
    }
    return Block2Ptr(blockPtr, bucket, reqSize);
}
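/*
 * Typical round trip through the three entry points, as a usage sketch
 * (sizes assume the 64-bit OFFSET == 16 from the table above):
 *
 *	char *p = TclpAlloc(100);   // 100+16 = 116 -> 128-byte bucket
 *	p = TclpRealloc(p, 108);    // 108+16 = 124 still fits: no copy
 *	p = TclpRealloc(p, 200);    // 200+16 = 216 -> copy to 256-byte block
 *	TclpFree(p);                // pushed back on this thread's bucket
 */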
/*
 *----------------------------------------------------------------------
 *
 * TclpFree --
 *
 *	Return blocks to the thread block cache.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	May move blocks to shared cache.
 *
 *----------------------------------------------------------------------
 */

void
TclpFree(
    char *ptr)
{
    Cache *cachePtr;
    Block *blockPtr;
    int bucket;

    if (TCL_PURIFY) {
	free(ptr);
	return;
    }

    if (ptr == NULL) {
	return;
    }

    blockPtr = Ptr2Block(ptr);

    /*
     * Get the block back from the user pointer and call system free directly
     * for large blocks. Otherwise, push the block back on the bucket and move
     * blocks to the shared cache if there are now too many free.
     */

    bucket = blockPtr->sourceBucket;
    if (bucket == NBUCKETS) {
	free(blockPtr);
	return;
    }

    cachePtr = GetAllocCache();
    blockPtr->nextBlock = cachePtr->buckets[bucket].firstPtr;
    cachePtr->buckets[bucket].firstPtr = blockPtr;
    cachePtr->buckets[bucket].numFree++;
    if (cachePtr != sharedPtr &&
	    cachePtr->buckets[bucket].numFree > bucketInfo[bucket].maxBlocks) {
	PutBlocks(cachePtr, bucket, bucketInfo[bucket].numMove);
    }
}

/*
 *----------------------------------------------------------------------
 *
 * TclpRealloc --
 *
 *	Re-allocate memory to a larger or smaller size.
 *
 * Results:
 *	Pointer to memory just beyond Block pointer.
 *
 * Side effects:
 *	Previous memory, if any, may be freed.
 *
 *----------------------------------------------------------------------
 */

char *
TclpRealloc(
    char *ptr,
    unsigned int reqSize)
{
    Block *blockPtr;
    void *newPtr;
    size_t size, min;
    int bucket;

    if (TCL_PURIFY) {
	return (char *) realloc(ptr, reqSize);
    }

    if (ptr == NULL) {
	return TclpAlloc(reqSize);
    }

#ifndef __LP64__
    if (sizeof(int) >= sizeof(size_t)) {
	/* An unsigned int overflow can also be a size_t overflow */
	const size_t zero = 0;
	const size_t max = ~zero;

	if (((size_t) reqSize) > max - OFFSET) {
	    /* Requested allocation exceeds memory */
	    return NULL;
	}
    }
#endif

    /*
     * If the block is not a system block and fits in place, simply return
     * the existing pointer. Otherwise, if the block is a system block and
     * the new size would also require a system block, call realloc()
     * directly.
     */

    blockPtr = Ptr2Block(ptr);
    size = reqSize + OFFSET;
    bucket = blockPtr->sourceBucket;
    if (bucket != NBUCKETS) {
	if (bucket > 0) {
	    min = bucketInfo[bucket-1].blockSize;
	} else {
	    min = 0;
	}
	if (size > min && size <= bucketInfo[bucket].blockSize) {
	    return Block2Ptr(blockPtr, bucket, reqSize);
	}
    } else if (size > MAXALLOC) {
	blockPtr = realloc(blockPtr, size);
	if (blockPtr == NULL) {
	    return NULL;
	}
	return Block2Ptr(blockPtr, NBUCKETS, reqSize);
    }

    /*
     * Finally, perform an expensive malloc/copy/free.
     */

    newPtr = TclpAlloc(reqSize);
    if (newPtr != NULL) {
	size_t toCopy = reqSize;

	if (bucket != NBUCKETS) {
	    /*
	     * Recover an upper bound on the old request size from the used
	     * byte, clamped to the bucket's capacity. A system block only
	     * reaches this point when shrinking, so there toCopy = reqSize
	     * is already safe.
	     */

	    size_t maxSize = bucketInfo[bucket].blockSize - OFFSET;
	    size_t oldSize = ((size_t) (blockPtr->used + 1))
		    << bucketInfo[bucket].shift;

	    if (oldSize > maxSize) {
		oldSize = maxSize;
	    }
	    if (toCopy > oldSize) {
		toCopy = oldSize;
	    }
	}
	memcpy(newPtr, ptr, toCopy);
	TclpFree(ptr);
    }
    return newPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * LockBucket, UnlockBucket --
 *
 *	Set/unset the lock to access a bucket in the shared cache.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	May block while the shared bucket lock is held by another thread.
 *
 *----------------------------------------------------------------------
 */

static inline void
LockBucket(
    int bucket)
{
    Tcl_MutexLock(bucketInfo[bucket].lockPtr);
}

static inline void
UnlockBucket(
    int bucket)
{
    Tcl_MutexUnlock(bucketInfo[bucket].lockPtr);
}
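/*
 * PutBlocks and GetBlocks below balance the per-thread caches against the
 * shared cache. As a worked example (assuming the 64-bit NBUCKETS == 10 and
 * the limits computed in InitBucketInfo): bucket 0 (32-byte blocks)
 * tolerates up to maxBlocks = 1 << 9 = 512 free blocks in a thread's cache
 * before TclpFree hands numMove = 256 of them back to the shared pool,
 * while the largest bucket keeps at most one free block. A thread that
 * finds its own bucket empty first pulls numMove blocks back from the
 * shared pool before splitting larger blocks or calling malloc().
 */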
/*
 *----------------------------------------------------------------------
 *
 * PutBlocks --
 *
 *	Return unused blocks to the shared cache.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static void
PutBlocks(
    Cache *cachePtr,
    int bucket,
    int numMove)
{
    register Block *lastPtr, *firstPtr;
    register int n = numMove;

    /*
     * Before acquiring the lock, walk the block list to find the last block
     * to be moved.
     */

    firstPtr = lastPtr = cachePtr->buckets[bucket].firstPtr;
    while (--n > 0) {
	lastPtr = lastPtr->nextBlock;
    }
    cachePtr->buckets[bucket].firstPtr = lastPtr->nextBlock;
    cachePtr->buckets[bucket].numFree -= numMove;

    /*
     * Acquire the lock and place the list of blocks at the front of the
     * shared cache bucket.
     */

    LockBucket(bucket);
    lastPtr->nextBlock = sharedPtr->buckets[bucket].firstPtr;
    sharedPtr->buckets[bucket].firstPtr = firstPtr;
    sharedPtr->buckets[bucket].numFree += numMove;
    UnlockBucket(bucket);
}

/*
 *----------------------------------------------------------------------
 *
 * GetBlocks --
 *
 *	Get more blocks for a bucket.
 *
 * Results:
 *	1 if blocks were allocated, 0 otherwise.
 *
 * Side effects:
 *	Cache may be filled with available blocks.
 *
 *----------------------------------------------------------------------
 */

static int
GetBlocks(
    Cache *cachePtr,
    int bucket)
{
    register Block *blockPtr = NULL;
    register int n;

    /*
     * First, attempt to move blocks from the shared cache. Note the
     * potentially dirty read of numFree before acquiring the lock which is a
     * slight performance enhancement. The value is verified after the lock
     * is actually acquired.
     */

    if (cachePtr != sharedPtr && sharedPtr->buckets[bucket].numFree > 0) {
	LockBucket(bucket);
	if (sharedPtr->buckets[bucket].numFree > 0) {
	    /*
	     * Either move the entire list or walk the list to find the last
	     * block to move.
	     */

	    n = bucketInfo[bucket].numMove;
	    if (n >= sharedPtr->buckets[bucket].numFree) {
		cachePtr->buckets[bucket].firstPtr =
			sharedPtr->buckets[bucket].firstPtr;
		cachePtr->buckets[bucket].numFree =
			sharedPtr->buckets[bucket].numFree;
		sharedPtr->buckets[bucket].firstPtr = NULL;
		sharedPtr->buckets[bucket].numFree = 0;
	    } else {
		blockPtr = sharedPtr->buckets[bucket].firstPtr;
		cachePtr->buckets[bucket].firstPtr = blockPtr;
		sharedPtr->buckets[bucket].numFree -= n;
		cachePtr->buckets[bucket].numFree = n;
		while (--n > 0) {
		    blockPtr = blockPtr->nextBlock;
		}
		sharedPtr->buckets[bucket].firstPtr = blockPtr->nextBlock;
		blockPtr->nextBlock = NULL;
	    }
	}
	UnlockBucket(bucket);
    }

    if (cachePtr->buckets[bucket].numFree == 0) {
	register size_t size;

	/*
	 * If no blocks could be moved from shared, first look for a larger
	 * block in this cache OR the shared cache to split up.
	 */

	n = NBUCKETS;
	size = 0; /* lint */
	while (--n > bucket) {
	    if (cachePtr->buckets[n].numFree > 0) {
		size = bucketInfo[n].blockSize;
		blockPtr = cachePtr->buckets[n].firstPtr;
		cachePtr->buckets[n].firstPtr = blockPtr->nextBlock;
		cachePtr->buckets[n].numFree--;
		break;
	    }
	}
	if (blockPtr == NULL) {
	    n = NBUCKETS;
	    size = 0; /* lint */
	    while (--n > bucket) {
		if (sharedPtr->buckets[n].numFree > 0) {
		    size = bucketInfo[n].blockSize;
		    LockBucket(n);
		    if (sharedPtr->buckets[n].numFree > 0) {
			blockPtr = sharedPtr->buckets[n].firstPtr;
			sharedPtr->buckets[n].firstPtr = blockPtr->nextBlock;
			sharedPtr->buckets[n].numFree--;
			UnlockBucket(n);
			break;
		    }
		    UnlockBucket(n);
		}
	    }
	}

	/*
	 * Otherwise, allocate a big new block directly.
	 */

	if (blockPtr == NULL) {
	    size = MAXALLOC;
	    blockPtr = malloc(size);
	    if (blockPtr == NULL) {
		return 0;
	    }
	}

	/*
	 * Split the larger block into smaller blocks for this bucket.
	 */

	n = size / bucketInfo[bucket].blockSize;
	cachePtr->buckets[bucket].numFree = n;
	cachePtr->buckets[bucket].firstPtr = blockPtr;
	while (--n > 0) {
	    blockPtr->nextBlock = (Block *)
		    ((char *) blockPtr + bucketInfo[bucket].blockSize);
	    blockPtr = blockPtr->nextBlock;
	}
	blockPtr->nextBlock = NULL;
    }
    return 1;
}

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */