#include "StdConc.h"
#include "IndexSetForQueryingStage.h"
#include "ConcordAlgorithm.h"
#include "StringIndexator.h"
#include <fcntl.h>

// Debug-only allocation tracking: when both DETECT_MEMORY_LEAK and _DEBUG are defined,
// `new` is redefined to DEBUG_NEW and THIS_FILE is (re)defined to hold this file's name.
// NOTE(review): DEBUG_NEW is not defined in this file; presumably it comes from the
// headers included above -- confirm.
#ifdef DETECT_MEMORY_LEAK
	#ifdef _DEBUG
	#define new DEBUG_NEW
	#undef THIS_FILE
	static char THIS_FILE[] = __FILE__;
	#endif
#endif

// Factor used in this file to size the output of dearchiving: output buffers are allocated
// as <number of items read from the file> * MaxArchiveRatio.
const size_t MaxArchiveRatio = 5;
// Running total of the bytes requested through CIndexSetForQueryingStage::ReadOccurrences
// (the counter is increased before the read is attempted).
size_t AllBytesRead =  0;



// Binds the index set to its owning indexator. No file is opened here:
// the occurrence file handle stays null until LoadIndexSet() opens it.
CIndexSetForQueryingStage::CIndexSetForQueryingStage(const CStringIndexator* pParent)
{
	// the parent is dereferenced by the other methods (path, search periods), so it must exist
	assert (pParent);

	m_pParent  = pParent;
	m_OccursFp = 0;
}

// Closes the occurrence file if it is open and resets the handle to null,
// so calling this method repeatedly is harmless.
void CIndexSetForQueryingStage::CloseOccursFile()
{
	if (!m_OccursFp)
		return;	// nothing is open

	fclose(m_OccursFp);
	m_OccursFp = 0;
}

// Releases the occurrence file handle (if any) owned by this index set.
CIndexSetForQueryingStage::~CIndexSetForQueryingStage()
{
	this->CloseOccursFile();
}

// Verifies that the parent's path is not the "#empty_path" placeholder.
// In debug builds the assertion fires first; otherwise the error is reported through
// ErrorMessage and a CExpc with code errUnknownPath is thrown.
void CIndexSetForQueryingStage::AssertHasPath() const
{
	assert(m_pParent->m_Path != "#empty_path");

	const bool bPathIsMissing = (m_pParent->m_Path == "#empty_path");
	if (!bPathIsMissing)
		return;

	ErrorMessage("Uninitialized path for index files");
	throw CExpc("Exception: Uninitialized path for index files", errUnknownPath);
}

bool CIndexSetForQueryingStage::DestroyIndexSet( )
{	
	AssertHasPath();
	ClearVector(m_Index);
	m_EndPeriodOffsets.clear();

	CloseOccursFile();

	if (FileExists(GetOccHdrFileName().c_str()))
		if (remove (GetOccHdrFileName().c_str() ) )
			return false;

	if (FileExists(GetOccursFileName().c_str() ))
		if (remove (GetOccursFileName().c_str() ))
			return false;

	if (FileExists(GetPeriodsDevisionFileName().c_str() ) )
		if (remove (GetPeriodsDevisionFileName().c_str()))
			return false;

	if (FileExists(GetFileNameForInfos().c_str()))
		if (remove (GetFileNameForInfos().c_str() ))
			return false;

	return true;
	
};


bool	CIndexSetForQueryingStage::LoadPeriodDevision()
{
	m_EndPeriodOffsets.clear();

	size_t CountOfPeriods = m_pParent->GetSearchPeriodsCount();
	string FName = GetPeriodsDevisionFileName();
	file_off_t FSize = FileSize(FName.c_str());
	FILE * fp = fopen (FName.c_str(), "rb");
	if (!fp) return false;
	try {
		int count =  (FSize/(CountOfPeriods+1)) / sizeof(size_t); // "+1" is for the first IndexItemOffset
		vector<DWORD> EndOffsets;
		EndOffsets.resize(CountOfPeriods);
		for (int i = 0; i < count; i++)
		{
			size_t t;
			if (fread(&t,sizeof(t), 1, fp) != 1) return false;
			ReadVectorInner(fp, EndOffsets, CountOfPeriods);
			if (EndOffsets.empty())
			{
				fprintf (stderr, "Cannot read period devision for IndexItemOffset = %i\n", t);
				return false;
			};
			m_EndPeriodOffsets[t] = EndOffsets;
		};
		fclose(fp);
		//printf("From file  %s %i vectors were read ", FName.c_str(), count);
		return true;
	}
	catch (CExpc c)
	{
		fprintf (stderr, "Exception %s in CIndexSetForQueryingStage::LoadPeriodDevision\n",c.m_strCause.c_str());
		return false;
	}
	catch (...)
	{
		fprintf (stderr, "Exception in CIndexSetForQueryingStage::LoadPeriodDevision\n");
		return false;
	}

};

bool	CIndexSetForQueryingStage::LoadIndexSet()
{
	CloseOccursFile();
	m_OccursFp = fopen(GetOccursFileName().c_str(), "rb");
	if (!m_OccursFp) return false;
	// disable buffer for fread under Windows, under Linuc this setvbuf could only slow down the program
	#ifdef	WIN32
		setvbuf(m_OccursFp, 0, _IONBF, 0);
	#endif


	ReadVector(GetOccHdrFileName().c_str(), m_Index);

	if (!LoadPeriodDevision()) return false;
	return true;
}



// Converts the stored items [in_start, in_end) into occurrences written to out_start.
//	bEnabled == true:  the input is decoded with DeconvertFromVariableInteger, which receives
//			OutCount as its output parameter
//	bEnabled == false: the input is not archived, it is copied as is and OutCount is the input length
// The caller provides an output buffer that is large enough for the result.
void DearchiveOccurrences(const DWORD* in_start, const DWORD* in_end,  DWORD* out_start, DWORD& OutCount, bool bEnabled)
{
	if (bEnabled)
	{
		DeconvertFromVariableInteger(in_start, in_end,  out_start, OutCount);
		return;
	}

	// plain copy of the non-archived data
	OutCount = in_end - in_start;
	memcpy(out_start, in_start, OutCount*sizeof(DWORD));
}


// Reads Count items of type CTokenNo from the occurrence file, starting at byte offset
// FilePosition, into OutBuffer.
// Throws CExpc with code errReadOccurrenceFile if the seek fails or fewer than Count items were read.
// The global counter AllBytesRead is increased by the requested size before the read is attempted.
// NOTE(review): FilePosition (file_off_t) and Count (size_t) are passed to Format for "%i";
// check that Format handles these argument types on all target platforms.
void CIndexSetForQueryingStage::ReadOccurrences (CTokenNo* OutBuffer, file_off_t FilePosition, size_t Count) const
{
	AllBytesRead +=  Count * sizeof(CTokenNo);

	if (!FSeek(m_OccursFp, FilePosition, SEEK_SET))
	{
		string SeekError = Format("Fatal error! Cannot seek to offset %i in %s\n", FilePosition, GetOccursFileName().c_str());
		throw CExpc(SeekError, errReadOccurrenceFile);
	}

	const size_t ItemsRead = fread (OutBuffer, sizeof(CTokenNo), Count, m_OccursFp);
	if (ItemsRead == Count)
		return;

	// short read: the file is shorter than the index claims, or an I/O error occurred
	string ReadError = Format("Fatal error! Cannot read from offset %i  %i dwords from %s\n", FilePosition, Count, GetOccursFileName().c_str());
	throw CExpc(ReadError, errReadOccurrenceFile);
}





/*
 AddLongOccursOccurs reads from m_OccursFp the occurrences of one index item which belong to
 one search period and appends them to Occurs. The index item must have an entry in
 m_EndPeriodOffsets (this is asserted). Input parameters are:
	IndexItemNo - the index item, whose occurrences we are going to read
	Occurs - output vector of occurrences, we will add occurrences to the end of this vector
	PeriodNo - the number of the subcorpus (search period) in which we should read occurrences
	OccurrsBuffer - an auxiliary buffer into which the items are read from the file before they
			are dearchived (it is passed by the caller for performance, for simplicity
			it could be declared inside AddLongOccursOccurs)
 Throws CExpc (via ReadOccurrences) if the occurrence file cannot be read.
*/
void CIndexSetForQueryingStage::AddLongOccursOccurs (size_t IndexItemNo, vector<CTokenNo>& Occurs, size_t PeriodNo, COccurrBuffer& OccurrsBuffer) const
{
	const CIndexItem& IndexItem = m_Index[IndexItemNo];
	size_t  IndexItemOffset = IndexItem.GetIndexItemOffset();
	PeriodsDivisionMap::const_iterator it = m_EndPeriodOffsets.find(IndexItemOffset);
	assert (it != m_EndPeriodOffsets.end());
	const vector<DWORD>& Periods = it->second;
	assert (Periods.size() == m_pParent->GetSearchPeriodsCount());
	assert (PeriodNo < Periods.size());

	// GetStartOccurNo scans the index backwards, so it is called only once;
	// the base is kept in QWORD in order to compute both bounds with the same 64-bit arithmetic
	const QWORD Base = GetStartOccurNo(IndexItemNo);
	const QWORD Start = (PeriodNo > 0) ? Base + Periods[PeriodNo - 1] : Base;
	const QWORD End = Base + Periods[PeriodNo];

	const QWORD MaxCountToRead = End-Start;
	if  (MaxCountToRead == 0) return;


	// reading from the file
	OccurrsBuffer.resize(MaxCountToRead);

	ReadOccurrences(&OccurrsBuffer[0], Start*(QWORD)sizeof(CTokenNo), MaxCountToRead);

	// dearchiving: reserve the maximal possible space at the end of Occurs and write there directly
	const size_t old_size =  Occurs.size();
	Occurs.resize(old_size + MaxCountToRead*MaxArchiveRatio);
	DWORD DearchivedSize = 0;
	DearchiveOccurrences(&OccurrsBuffer[0], &OccurrsBuffer[0]+OccurrsBuffer.size(),  &Occurs[0] + old_size, DearchivedSize, m_bArchiveOccurrences);

	// adjusting the size of Occurs, since we have reserved too much space (MaxCountToRead*MaxArchiveRatio)
	Occurs.resize(old_size + DearchivedSize);

}





// Returns the number of the first occurrence of index item IndexNo in the occurrence file.
// This is the end offset of the nearest preceding index item which does not have the flag
// TheOnlyOccurIsInEndOccurNo; items with this flag occur only one time in the corpora and
// take no space in m_OccursFp, so they are passed over.
// Returns 0 if there is no such preceding item (in particular for IndexNo == 0).
size_t CIndexSetForQueryingStage::GetStartOccurNo(size_t IndexNo) const
{
	// walk from IndexNo-1 down to 0 inclusive
	for (size_t PrevNo = IndexNo; PrevNo-- > 0; )
	{
		const bool bTakesNoSpace = (m_Index[PrevNo].GetItemIndexFlags() & TheOnlyOccurIsInEndOccurNo) != 0;
		if (!bTakesNoSpace)
			return m_Index[PrevNo].GetEndOccurOffset();
	}

	// all preceding items (if any) are stored outside the occurrence file
	return 0;
}

// Appends to Occurs the occurrences of index item IndexItemNo which belong to search period PeriodNo.
// Three cases are distinguished:
//	1. the item has the flag TheOnlyOccurIsInEndOccurNo: its only occurrence is taken from the index item itself;
//	2. the item has no period division in m_EndPeriodOffsets: all its occurrences are obtained
//	   (from the cache or from the file), and the part which lies in the search period is appended;
//	3. otherwise the work is passed to AddLongOccursOccurs.
// Parameters:
//	TempOccurrsBuffer - an auxiliary buffer for the items read from the file
//	pCacheByIndexSet - an optional cache of occurrences (can be null)
//	CacheId - in/out: if it is not -1 and the cache is given, the occurrences are taken from the cache;
//	          it receives the id of the new cache item, if a new item was inserted to the cache
// Throws CExpc (via ReadOccurrences) if the occurrence file cannot be read.
void  CIndexSetForQueryingStage::AddOccurs (size_t IndexItemNo, vector<CTokenNo>& Occurs, size_t PeriodNo, COccurrBuffer& TempOccurrsBuffer, CShortOccurCache* pCacheByIndexSet, int& CacheId) const
{
	
	const CIndexItem& IndexItem = m_Index[IndexItemNo];
	const CTokenNo StartSearchPeriod = (PeriodNo == 0)? 0: m_pParent->GetSearchPeriod(PeriodNo-1);
	const CTokenNo EndSearchPeriod =  m_pParent->GetSearchPeriod(PeriodNo);
	//  if there is only one occurrence of this index item in the corpora
	// then we should get it from IndexItem.GetEndOccurOffset()
	if (IndexItem.GetItemIndexFlags() & TheOnlyOccurIsInEndOccurNo)
	{
		
		CTokenNo T = IndexItem.GetEndOccurOffset();
		if	(	(T >= StartSearchPeriod)	
			  &&(T < EndSearchPeriod)
			)
			Occurs.push_back(T);
	}
	else
	//  if there is no period division for this token, then read all occurrences from all periods
	if (m_EndPeriodOffsets.find(IndexItem.GetIndexItemOffset()) ==  m_EndPeriodOffsets.end())
	{
		QWORD Start = GetStartOccurNo(IndexItemNo);
		QWORD End  = IndexItem.GetEndOccurOffset();
		DWORD MaxCountToRead = End-Start;

		const CTokenNo* pBuffer;

		// The buffer for the dearchived occurrences is declared in this scope (not inside the
		// branch which fills it), because pBuffer can point to it until the end of this block.
		// NOTE(review): nothing here checks that the dearchived data fits into
		// OccurBufferSize*MaxArchiveRatio items; presumably items without a period division
		// are short enough -- confirm against the indexing stage.
		CTokenNo NewBuffer[OccurBufferSize*MaxArchiveRatio];

		TempOccurrsBuffer.resize(MaxCountToRead);
		
		if (pCacheByIndexSet && CacheId != -1)
		{
			pBuffer = pCacheByIndexSet->GetOccurrencesFromCache(CacheId, MaxCountToRead);
		}
		else
		{
			ReadOccurrences (&TempOccurrsBuffer[0], Start*(QWORD)sizeof(CTokenNo), MaxCountToRead);
			pBuffer = &TempOccurrsBuffer[0];

			// dearchiving the occurrences ( if !m_bArchiveOccurrences then it is just a copy )
			// MaxCountToRead is going to be changed, if the index is archived  
			// it will contain the real number of positions for this  IndexItemNo
			DearchiveOccurrences(pBuffer, pBuffer  + MaxCountToRead,  NewBuffer, MaxCountToRead, m_bArchiveOccurrences);
			pBuffer = NewBuffer;

			if (pCacheByIndexSet && pCacheByIndexSet->CouldContainMore() )
			{
				//  inserting new cache item, if the cache is not too large
				CacheId = pCacheByIndexSet->AddNewIndexItemNoToCache(pBuffer, pBuffer+MaxCountToRead);
			};
		}
		
		// finding the start and the end position of the search period 
		const CTokenNo* start_it = lower_bound(pBuffer, pBuffer+MaxCountToRead,StartSearchPeriod);
		const CTokenNo* end_it = lower_bound(start_it, pBuffer+MaxCountToRead,EndSearchPeriod);

		// adding the found occurrences to the output vector
		Occurs.insert(Occurs.end(), start_it, end_it);
	}
	else
	{
			AddLongOccursOccurs(IndexItemNo, Occurs, PeriodNo, TempOccurrsBuffer);
	};
	
}


// Replaces the contents of Occurs with the occurrences of index item IndexItemNo
// collected over all search periods of the parent, period by period. No cache is used.
void CIndexSetForQueryingStage::ReadAllOccurrences (size_t IndexItemNo, vector<CTokenNo>& Occurs) const
{
	Occurs.clear();

	COccurrBuffer ScratchBuffer;
	size_t PeriodNo = 0;
	while (PeriodNo < m_pParent->GetSearchPeriodsCount())
	{
		// -1 together with the null cache pointer: nothing is taken from or stored in a cache
		int NoCacheId = -1;
		AddOccurs(IndexItemNo, Occurs, PeriodNo, ScratchBuffer, 0, NoCacheId);
		++PeriodNo;
	}
}




// Returns the name of the occurrence header file: MakeFName(<parent path>, "_occ_hdr_") followed by GetName().
// Throws CExpc (via AssertHasPath) if the parent's path is not initialized.
string CIndexSetForQueryingStage::GetOccHdrFileName() const
{
	AssertHasPath();

	string FileName = MakeFName(m_pParent->m_Path,"_occ_hdr_");
	FileName += GetName();
	return FileName;
}

// Returns the name of the occurrence file: MakeFName(<parent path>, "_occurs_") followed by GetName().
// Throws CExpc (via AssertHasPath) if the parent's path is not initialized.
string CIndexSetForQueryingStage::GetOccursFileName() const
{
	AssertHasPath();

	string FileName = MakeFName(m_pParent->m_Path,"_occurs_");
	FileName += GetName();
	return FileName;
}
// Returns the name of the period division file: MakeFName(<parent path>, "_perdiv") followed by GetName().
// Throws CExpc (via AssertHasPath) if the parent's path is not initialized.
string	CIndexSetForQueryingStage::GetPeriodsDevisionFileName () const
{
	AssertHasPath();

	string FileName = MakeFName(m_pParent->m_Path,"_perdiv");
	FileName += GetName();
	return FileName;
}
// Returns the name of the infos file: MakeFName(<parent path>, "_" + GetName()).
// Unlike the other file names of the index set, GetName() is a part of the argument of MakeFName here.
// Throws CExpc (via AssertHasPath) if the parent's path is not initialized.
string	CIndexSetForQueryingStage::GetFileNameForInfos() const
{
	AssertHasPath();

	string Suffix("_");
	Suffix += GetName();
	return MakeFName(m_pParent->m_Path, Suffix);
}

// Returns what FileSize reports for the occurrence file of this index set.
// Throws CExpc (via AssertHasPath) if the parent's path is not initialized.
file_off_t	CIndexSetForQueryingStage::GetOccurrsFileSize() const
{
	AssertHasPath();

	const string OccursFileName = GetOccursFileName();
	return FileSize(OccursFileName.c_str());
}


