/*
This product contains certain software code or other information
("AT&T Software") proprietary to AT&T Corp. ("AT&T").  The AT&T
Software is provided to you "AS IS".  YOU ASSUME TOTAL RESPONSIBILITY
AND RISK FOR USE OF THE AT&T SOFTWARE.  AT&T DOES NOT MAKE, AND
EXPRESSLY DISCLAIMS, ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND
WHATSOEVER, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, WARRANTIES OF
TITLE OR NON-INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS, ANY
WARRANTIES ARISING BY USAGE OF TRADE, COURSE OF DEALING OR COURSE OF
PERFORMANCE, OR ANY WARRANTY THAT THE AT&T SOFTWARE IS "ERROR FREE" OR
WILL MEET YOUR REQUIREMENTS.

Unless you accept a license to use the AT&T Software, you shall not
reverse compile, disassemble or otherwise reverse engineer this
product to ascertain the source code for any AT&T Software.

(c) AT&T Corp. All rights reserved.  AT&T is a registered trademark of AT&T Corp.

***********************************************************************

History:

      24/11/99  - initial release by Hartmut Liefke, liefke@seas.upenn.edu
                                     Dan Suciu,      suciu@research.att.com
*/

//**************************************************************************
//**************************************************************************

// This module contains the Enumeration-Compressor 'e'
// The enumeration-compressor (or dictionary compressor) assigns a
// positive integer to each distinct string that has occurred. In the compressor,
// a dictionary of all previously seen strings is maintained, and a hash table
// allows efficient lookup. A new
// string is assigned the next free number.

// The decompressor loads the dictionary and directly accesses the
// dictionary for a given integer.

// Note that there is *only one* hash table for all enum-compressors !!
// I.e. all new strings are represented in the same hash table !

#pragma once

#include "CompressMan.hpp"
#include "MemMan.hpp"
#include "UnCompCont.hpp"
#include "SmallUncompress.hpp"
#include "XMill.h"

// Size threshold (in bytes) separating "small" from "large" global data.
// NOTE(review): presumably dictionaries smaller than this are stored in the
// header and larger ones in separate blocks - confirm in the implementation
#define SMALLCOMPRESS_THRESHOLD  1024

// The MemStreamer used for storing the hashtable entries in the compressor
// and the compressor states in the decompressor.
// The macro expands to an expression on 'session', so a variable (or member)
// named 'session' must be in scope wherever the macro is used.
#define enumcompressmem (session->blockmem)

//*************************************************************

// The compressor is based on a hash table implementation

// Geometry of the hash table.
// The shift (log2 of the table size) is the single source of truth;
// the number of slots and the index mask are derived from it.
#define ENUMHASHTABLE_SHIFT 15
#define ENUMHASHTABLE_SIZE  (1 << ENUMHASHTABLE_SHIFT)
#define ENUMHASHTABLE_MASK  (ENUMHASHTABLE_SIZE - 1)

// Forward declaration; the hash entries refer to the compressor state by pointer
struct EnumCompressState;

struct EnumHashEntry
   // Stores a single hash entry, i.e. one string together with
   // the compressor state it belongs to and its index in that state.
   // Entries that share a hash index are linked through 'nextsamehash'.
{
   EnumHashEntry     *nextsamehash; // The next entry with the same hash index
   unsigned short    datalen;       // The length of the string
                                    // NOTE(review): an unsigned short cannot represent lengths
                                    // above 65535, while FindOrCreateEntry takes an 'int len' -
                                    // confirm that longer strings never reach this point
   char              *dataptr;      // The pointer to the string
                                    // Strings are stored in a separate MemStreamer!
   EnumCompressState *enumstate;    // The pointer to the state of the compressor
   unsigned          localidx;      // The index number for this string
                                    // This number is unique within a given compressor state

#ifdef NOTHREAD
   // Class-specific allocation operator; only declared when NOTHREAD is defined.
   // NOTE(review): presumably allocates from 'enumcompressmem' - confirm in the implementation
   void *operator new(size_t size);
#endif
   // NOTE(review): presumably returns the pointer to the stored string - confirm in the implementation
	char *GetStrPtr();
};

class EnumHashTable
   // The hash table implementation
   // The table has ENUMHASHTABLE_SIZE slots; each slot holds a pointer to
   // an EnumHashEntry (entries with the same hash index are chained
   // through EnumHashEntry::nextsamehash).
{
   EnumHashEntry    *hashtable[ENUMHASHTABLE_SIZE];   // The slots of the hash table
   char             isinitialized;                    // Flag for the initialization state
                                                      // NOTE(review): presumably cleared by Reset() and
                                                      // checked before adding elements - confirm

   static inline unsigned CalcHashIdx(char *str,int len);
      // Computes the hash index for a given string
      // 'str' is the string and 'len' is its length
public:
   EnumHashTable();
   void Initialize();
      // The hash table is emptied
   void Reset();
      // This will cause the hash table to be emptied next time we try to
      // add elements
   EnumHashEntry *FindOrCreateEntry(char *str,int len,EnumCompressState *enumstate,char *isnew,MemStreamer *strmem);
      // Finds or creates a new hash entry for string 'str' of length 'len'
      // enumstate is the state for the compressor, *isnew will be set to 1 if a new entry has been created
      // strmem is the MemStreamer used for allocating string memory space
      // Returns the pointer to the (found or newly created) hash entry
};

// The actual compressor implementation follows now
struct EnumCompressState
   // The state for each enum-compressor
   // Hash entries refer to their state through EnumHashEntry::enumstate
{
   unsigned long     curidx;              // The number of already assigned strings
                                          // This is also the next available index
   MemStreamer       stringmem;           // The memory streamer used for storing
                                          // the strings
   unsigned long     compressed_size;     // We keep track of the number of compressed bytes ...
   unsigned long     uncompressed_size;   // ... and uncompressed bytes
   EnumCompressState *next;               // All enumstates are kept in a global list
                                          // Points to the next enumstate in the global list
                                          // (see CompEnumerationCompressorFactory::AddEnumCompressState)
};

//********************************************************************************

class EnumerationCompressor : public UserCompressor
   // The actual compressor
   // The per-instance state is not kept in this object; it is passed to
   // each method through the 'dataptr' argument.
{
public:
	EnumerationCompressor();
   EnumerationCompressor(Session *s);
      // 's' is the session this compressor belongs to
	void Init();
   void InitCompress(CompressContainer *cont,char *dataptr);
      // Initializes the compressor for the specific enum state 'dataptr'
      // NOTE(review): 'dataptr' presumably points to an EnumCompressState - confirm
   void CompressString(char *str,unsigned len,CompressContainer *cont,char *dataptr);
      // Compresses a specific string item 'str' of length 'len'
      // 'cont' is the container and 'dataptr' is the enum state

   // Note that we don't implement the 'FinishCompress' method for UserCompressors.
   // The reason is that we do the actual storage in the EnumCompressFactory-class.

   void PrintCompressInfo(char *dataptr,unsigned long *overalluncomprsize,unsigned long *overallcomprsize);
      // Prints statistical information about how well the compressor compressed
      // the data
      // NOTE(review): the two size pointers are presumably running totals that
      // this state's sizes are added to - confirm in the implementation
};

// The Enum-Decompress
struct EnumDictItem
   // Represents an entry in the dictionary of the decompressor
   // The entries are stored in a sequence in memory
{
   unsigned long  len;        // The length of the string
   unsigned char  *dataptr;   // The pointer to the string data
                              // NOTE(review): presumably points into
                              // EnumUncompressState::strbuf - confirm
};

struct EnumUncompressState
   // The state of the decompressor, i.e. one loaded dictionary
{
   unsigned long  itemnum;       // The number of items
   unsigned long  size;          // The size of the dictionary
                                 // NOTE(review): presumably in bytes - confirm
   EnumDictItem   *itemarray;    // The array of dictionary items
   unsigned char  *strbuf;       // The pointer to the string buffer
};

class EnumerationUncompressor : public UserUncompressor
   // The actual decompressor
   // An item is decompressed by looking up the dictionary of the
   // state that was loaded from the compressed file
{
   EnumUncompressState *GetNextPossibleEnumUnCompressState();
      // This auxiliary function returns the next state from the sequence of states
      // that was previously loaded from the compressed file
      // The sequence of states is stored in the EnumCompressFactory-object
public:
   EnumerationUncompressor(Session *s);
      // 's' is the session this decompressor belongs to
   EnumerationUncompressor();
	void Init();
   void InitUncompress(UncompressContainer *cont,char *dataptr);
      // Initializes the decompressor by simply retrieving the next
      // state from the list of states
   void UncompressItem(UncompressContainer *cont,char *dataptr,XMLOutput *output);
      // An item is decompressed by looking up the dictionary
      // NOTE(review): the result is presumably written to 'output' - confirm
};

//**************************************************************************

class EnumerationCompressorFactory : public UserCompressorFactory
   // The actual enum compressor factory
   // This is the common base of the compression-side and the
   // decompression-side factories declared below
{
protected:
   unsigned                   enuminstancecount;   // The number of enum compressor instantiations

public:
	EnumerationCompressorFactory(Session *s,int comp);
      // 's' is the session the factory belongs to
      // NOTE(review): 'comp' presumably distinguishes compression from
      // decompression mode - confirm in the implementation
   char *GetName();
      // Returns the name of the compressor
   char *GetDescription();
      // Returns a description of the compressor
   char IsRejecting();
      // NOTE(review): presumably tells whether the compressor can reject strings - confirm
   char CanOverlap();
      // NOTE(review): semantics are defined by UserCompressorFactory - confirm there
};

class CompEnumerationCompressorFactory: public EnumerationCompressorFactory
   // The compression-side factory: it owns the single compressor instance
   // and the global list of all enum compressor states
{
   EnumerationCompressor      enumcompress;     // We need only one compressor instance
   
   EnumCompressState          *enumstatelist,   // The global list of enumstates
                              **lastenumstateref;
                                                // NOTE(review): presumably references the link where
                                                // the next enumstate gets appended - confirm

public:
	CompEnumerationCompressorFactory(Session *s);
   void AddEnumCompressState(EnumCompressState *state);
      // Adds a new enumstate to the global list
   UserCompressor *InstantiateCompressor(char *paramstr,int len);
      // The instantiation simply returns the one instance we have
      // 'paramstr' is the parameter string and 'len' is its length

   // The compression/decompression routines for the factory
   // CompressFactories are also allowed to store status information
   // in the compressed file ! The following procedures are used for
   // compressing/decompressing this information:
   // Small data (<1024Bytes) is stored in the header, while
   // large data is stored in separate zlib-blocks in the output file
   void CompressSmallGlobalData(Compressor *compressor);
      // Compresses the small data
   void CompressLargeGlobalData(Output *output);
      // Compresses the large dictionaries
      // Furthermore, we also release all the memory of all (also the small) dictionaries
   unsigned long GetGlobalDataSize();
      // Determines how much memory we need for the dictionaries
      // This information is later used in the decompression to allocate
      // the appropriate amount of memory
};

class DecompEnumerationCompressorFactory: public EnumerationCompressorFactory
   // The decompression-side factory: it owns the single decompressor instance
   // and the array of decompressor states loaded from the input file
{
   EnumerationUncompressor    enumuncompress;   // We need only one decompressor instance

   // We keep a temporary array of states for the decompressors
   // The factory fills the array by decompressing the status information
   // from the input file. The actual states of the decompressors are then
   // initialized with these states
   EnumUncompressState        *enumuncompressstates;        
   unsigned long              activeenumuncompressstates;   // The number of states in the array

public:
	DecompEnumerationCompressorFactory(Session *s);
   UserUncompressor *InstantiateUncompressor(char *paramstr,int len);
      // The instantiation simply returns the one instance we have
      // 'paramstr' is the parameter string and 'len' is its length
	void UncompressSmallGlobalData(SmallBlockUncompressor *uncompressor);
      // Uncompresses the small data
      // NOTE(review): presumably the counterpart of
      // CompEnumerationCompressorFactory::CompressSmallGlobalData - confirm
   void UncompressLargeGlobalData(Input *input);
      // Uncompresses the large dictionaries
   EnumUncompressState *GetNextPossibleEnumUnCompressState();
      // Retrieves the next state from the sequence of states
      // The next call retrieves the next state and so on.
   void FinishUncompress();
      // Releases the memory after decompression
};
