#include "StdConc.h"
#include "../ConcordLib/IndexSetForLoadingStage.h"
#include "../ConcordLib/ConcIndexator.h"

//#define LARGE_FILE_BUG

#ifdef DETECT_MEMORY_LEAK
	#ifdef _DEBUG
	#define new DEBUG_NEW
	#undef THIS_FILE
	static char THIS_FILE[] = __FILE__;
	#endif
#endif


//===========================================================
//                    class CItemIndexForLoading
//==========================================================

//! initializes vector of  occurrences
bool CItemIndexForLoading::InitOccurs() 
{
	m_pCurrOccurs = new vector<CTokenNo>;
	if (!m_pCurrOccurs)
	{
		fprintf(stderr, "cannot allocate new vector of occurrences in CItemIndexForLoading::InitOccurs\n");
		return false;
	}
	return true;
};

//! Destroys the vector of occurrences and resets m_pCurrOccurs to null.
//! The vector is expected to exist (checked by an assertion).
//! If anything is thrown while deleting, a message is written to stderr and the exception is rethrown.
void CItemIndexForLoading::FreeOccurs() 
{
	try 
	{
		assert (m_pCurrOccurs != 0);

		delete m_pCurrOccurs;
		m_pCurrOccurs = 0;
	}
	catch (...)
	{
		fprintf (stderr, "Exception in CItemIndexForLoading::FreeOccurs\n");
		throw;
	}
}

//! Writes the vector of occurrences to the file fp by means of WriteVectorInner.
//! \return the result of WriteVectorInner
bool CItemIndexForLoading::WriteOccurrences(FILE* fp) const 
{
	bool bWritten = WriteVectorInner(fp, *m_pCurrOccurs);
	return bWritten;
}

//! Checks that the occurrences are strictly increasing and that the last one is less than EndTokenNo.
//! \return true if the check passes or if there are no occurrences at all
bool CItemIndexForLoading::CheckOccurrences(CTokenNo EndTokenNo) const 
{
	const vector<CTokenNo>& Occurs = *m_pCurrOccurs;

	// an empty set is valid: it appears when all occurrences are masked with "-#"
	if (Occurs.empty())
		return true;

	// every occurrence must be strictly less than its successor
	for (size_t next = 1; next < Occurs.size(); next++)
	{
		if (Occurs[next-1] >= Occurs[next])
			return false;
	}

	// the largest occurrence must stay below the upper bound
	return !(Occurs.back() >= EndTokenNo);
}

//! clears vector of  occurrences to a file
void CItemIndexForLoading::ClearOccurrences() 
{
	ClearVector(*m_pCurrOccurs);
};


#ifdef LARGE_FILE_BUG
	#pragma optimize( "", off )
#endif

//! Reads one record from a temporary file: m_IndexItemOffset, the number of occurrences
//! and the occurrences themselves (the layout written by WriteToTemporalFile).
//! \return false if m_IndexItemOffset cannot be read (the caller uses this together with feof 
//!         to detect the end of the file), true if the whole record was read
//! Throws CExpc if the offset was read but the number of occurrences could not be read.
bool CItemIndexForLoading::ReadFromTemporalFile (FILE* fp)
{
	// a record starts with m_IndexItemOffset
	const size_t nOffsetsRead = fread(&m_IndexItemOffset, sizeof(m_IndexItemOffset), 1, fp);
	if (nOffsetsRead != 1)
		return false;

	// then comes the number of occurrences
	size_t nOccurs;
	const bool bCountRead = (fread(&nOccurs, sizeof(size_t), 1, fp) == 1);
	if (!bCountRead)
		throw CExpc(Format("Exception in CItemIndexForLoading::ReadFromTemporalFile: offset %u\n",FTell(fp)));

	// the previous contents of the vector are replaced by the record being read
	m_pCurrOccurs->clear();

	#ifdef LARGE_FILE_BUG
		m_pCurrOccurs->reserve(nOccurs);
	#endif

	// and finally the occurrences; if ReadVectorInner cannot read nOccurs items, it throws an exception
	ReadVectorInner(fp, *m_pCurrOccurs, nOccurs);

	return true;
}

//! Writes one record to a temporary file: m_IndexItemOffset, the number of occurrences (as size_t)
//! and the occurrences themselves. ReadFromTemporalFile reads the same layout back.
//! Throws CExpc if any of the three parts cannot be written.
void CItemIndexForLoading::WriteToTemporalFile (FILE* fp) const
{
	if (fwrite(&m_IndexItemOffset, sizeof(m_IndexItemOffset), 1 , fp) != 1) 
		throw CExpc ("Exception in CItemIndexForLoading::WriteToTemporalFile(1)");

	const size_t OccCount = m_pCurrOccurs->size();
	if (fwrite(&OccCount, sizeof(size_t), 1, fp) != 1) 
		throw CExpc ("Exception in CItemIndexForLoading::WriteToTemporalFile(2)");

	if (!WriteVectorInner(fp, *m_pCurrOccurs)) 
	{
		// "%i" consumes an int, so the size_t value must be converted explicitly:
		// passing a size_t through "..." is undefined where size_t is wider than int
		throw CExpc (Format("Exception in CItemIndexForLoading::WriteToTemporalFile while writing %i tokens", static_cast<int>(OccCount)));
	}
}

#ifdef LARGE_FILE_BUG
	#pragma optimize( "", on )
#endif

//===========================================================
//                    class CIndexSetForLoadingStage
//==========================================================

//! Creates an index set with item storage switched off and no temporary storage file.
//! Both comparators (m_LoadLess1, m_LoadLess2) receive the address of m_StringBuffer.
CIndexSetForLoadingStage::CIndexSetForLoadingStage ():
m_LoadLess2(&m_StringBuffer), m_LoadLess1(&m_StringBuffer)
{
	// no temporary storage file is open yet
	m_TempStorageFile = 0;
	// item storage is off by default
	m_bUseItemStorage = false;
}


//! Only computes the names of the temporary files (via MakeFName from Path and the index name);
//! unlike CreateTempFiles it does not create or open any file.
//! \return always true
bool CIndexSetForLoadingStage::CreateTempFilesDebug(string Path)
{
	const string IndexName = GetName();

	m_MainOccurTempFileName = MakeFName(Path, "tmp_main_" + IndexName);
	m_CurrOccurTempFileName = MakeFName(Path, "tmp_curr_" + IndexName);

	// the storage file name is needed only when item storage is on
	if (m_bUseItemStorage)
	{
		m_TempStorageFileName = MakeFName(Path, "tmp_storage_" + IndexName);
	}
	return true;
}

//! Computes the names of the temporary files and creates the files.
//! The main and the current occurrence files are created empty (opened with "wb" and closed at once);
//! if item storage is on, the storage file is opened with "wb" and left open in m_TempStorageFile.
//! \return false as soon as one of the files cannot be opened, true otherwise
bool CIndexSetForLoadingStage::CreateTempFiles(string Path)
{
	// main occurrence file
	m_MainOccurTempFileName = MakeFName(Path, "tmp_main_"+GetName());
	FILE* pMainFile = fopen (m_MainOccurTempFileName.c_str(), "wb");
	if (pMainFile == 0)
		return false;
	fclose( pMainFile );

	// current occurrence file
	m_CurrOccurTempFileName = MakeFName(Path, "tmp_curr_"+GetName());
	FILE* pCurrFile = fopen (m_CurrOccurTempFileName.c_str(), "wb");
	if (pCurrFile == 0)
		return false;
	fclose( pCurrFile );

	// nothing more to do without item storage
	if (!m_bUseItemStorage)
		return true;

	// item storage file: stays open for writing
	m_TempStorageFileName = MakeFName(Path, "tmp_storage_"+GetName());
	m_TempStorageFile = fopen( m_TempStorageFileName.c_str(), "wb");
	return m_TempStorageFile != 0;
}



bool CIndexSetForLoadingStage::DeleteTempFiles ()
{
	bool bResult = true;
	if (FileExists(m_MainOccurTempFileName.c_str()))
	{
		if (remove(m_MainOccurTempFileName.c_str()))
			bResult = false;
	};

	if (FileExists(m_CurrOccurTempFileName.c_str()))
		if (remove(m_CurrOccurTempFileName.c_str()))
		{
			fprintf(stderr, "Cannot remove %s\n", m_CurrOccurTempFileName.c_str());
			bResult = false;
		}
	
	if (m_bUseItemStorage)
	{
		if (m_TempStorageFile)
			fclose (m_TempStorageFile);
		m_TempStorageFile = 0;

		if (FileExists(m_TempStorageFileName.c_str()))
		{
			if (remove(m_TempStorageFileName.c_str()))
			{
				fprintf(stderr, "Cannot remove %s\n", m_TempStorageFileName.c_str());
				bResult = false;
			}
		}
	};

	// clear all load indices
	for (int j = 0; j < 256; j++)
	{
		for (int i = 0; i < m_MemoryLoadIndexHash[j].size(); i++)
				m_MemoryLoadIndexHash[j][i].FreeOccurs();

		m_MemoryLoadIndexHash[j].clear();
	};

		
	return bResult;
};





//! Removes the temporary files and frees the in-memory load index (see DeleteTempFiles).
//! DeleteTempFiles calls FreeOccurs, which rethrows whatever it catches; an exception must not
//! leave a destructor, so it is reported on stderr and swallowed here.
CIndexSetForLoadingStage::~CIndexSetForLoadingStage ()
{
	try
	{
		DeleteTempFiles();
	}
	catch (...)
	{
		fprintf (stderr, "Exception in CIndexSetForLoadingStage::~CIndexSetForLoadingStage\n");
	}
}



//! \return the total number of items in the 256 buckets of m_MemoryLoadIndexHash
size_t CIndexSetForLoadingStage::GetMemoryLoadIndexItemsCount() const
{
	size_t Total = 0;

	int HashNo = 0;
	while (HashNo < 256)
	{
		Total += m_MemoryLoadIndexHash[HashNo].size();
		++HashNo;
	}

	return Total;
}

//! Writes every item of V to the temporary file fp and then clears the occurrences of that item.
//! T must provide WriteToTemporalFile(FILE*) and ClearOccurrences().
//! The items themselves stay in V; only their occurrences are cleared.
//! \return always true; a failure can only surface as an exception thrown by WriteToTemporalFile
template<class T>
bool	WriteLoadIndexToTempFileAndClear(FILE* fp, std::vector<T>& V)
{
	const size_t count = V.size();
	// the index has the type of the count, so no signed/unsigned comparison takes place
	for (size_t i = 0; i < count; i++)
	{
		V[i].WriteToTemporalFile(fp);
		V[i].ClearOccurrences();
	}
	return true;
}

bool	CIndexSetForLoadingStage::SaveMemoryLoadIndex()
{
	FILE* fp = 0;
	try {
		fp = fopen(m_CurrOccurTempFileName.c_str(), "wb");
		if (!fp) return false;

		for (int i = 0; i < 256; i++)
		{
			if (!WriteLoadIndexToTempFileAndClear(fp, m_MemoryLoadIndexHash[i]))
			{

				fclose(fp);
				return false;
			}
		}

		fclose(fp);
		return true;
	}
	catch (CExpc c)
	{
		fprintf (stderr, "Exception %s in CIndexSetForLoadingStage::SaveMemoryLoadIndex; Index name=%s\n", c.m_strCause.c_str(), GetName().c_str());
		return false;
	}
	catch(...)
	{
		fprintf (stderr, "Exception in CIndexSetForLoadingStage::SaveMemoryLoadIndex; Index name=%s\n",GetName().c_str());
		if (!fp) fclose (fp);
		return false;
	}
};

//! Looks for Item in the vector V with lower_bound and the comparator m_LoadLess1.
//! \param it receives the lower_bound position: the item itself if it is found
//! \return true if the position is inside V and m_LoadLess1.are_equal reports the element there equal to Item
bool CIndexSetForLoadingStage::FindIndexItemInVector (const char* Item, vector<CItemIndexForLoading>::iterator& it, vector<CItemIndexForLoading>& V)
{
	it = lower_bound(V.begin(), V.end(),Item, m_LoadLess1);

	// past the end: nothing to compare with
	if (it == V.end())
		return false;

	return m_LoadLess1.are_equal(*it, Item);
}

//! Looks for Item in bucket HashNo of m_MemoryLoadIndexHash; only if it is not found there, 
//! looks for it in bucket HashNo of m_InputLoadIndexHash.
//! \param it receives the position set by the last search performed (see FindIndexItemInVector)
//! \return true if the item is found in one of the two buckets
bool CIndexSetForLoadingStage::FindIndexItem (const char* Item, vector<CItemIndexForLoading>::iterator& it, int HashNo)
{
	const bool bInMemoryIndex = FindIndexItemInVector(Item, it, m_MemoryLoadIndexHash[HashNo]);

	// the second search runs only when the first one fails
	return bInMemoryIndex || FindIndexItemInVector(Item, it, m_InputLoadIndexHash[HashNo]);
}



//! Merges FileIndexSet into Body with set_union (ordered by m_LoadLess2) and clears FileIndexSet.
//! \return false, after a message on stderr, if 
//!         - FileIndexSet is not strictly ordered by m_LoadLess2 (nothing is changed), or
//!         - the union is smaller than the two inputs together, i.e. set_union found equivalent items 
//!           (nothing is changed), or
//!         - the merged Body is not strictly ordered (Body is already replaced, FileIndexSet is kept);
//!         true otherwise
bool	CIndexSetForLoadingStage::AddToMemoryLoadIndexAndClear(vector<CItemIndexForLoading>& Body, vector<CItemIndexForLoading>& FileIndexSet)
{
	// special check: the input of set_union must be sorted
	const size_t FileCount = FileIndexSet.size();
	for (size_t i = 0; i+1 < FileCount; i++)
		if (!m_LoadLess2(FileIndexSet[i], FileIndexSet[i+1]) )
		{
			fprintf (stderr,"An error occurred in function %s::AddToMemoryLoadIndexAndClear (order error:FileIndexSet is not sorted)\n", GetName().c_str());
			return false;
		}

	// set_union writes to an existing range, so the result is created with the maximal possible size
	const size_t ExpectedCount = FileIndexSet.size() + Body.size();
	vector<CItemIndexForLoading>	Result;
	Result.resize(ExpectedCount);
	vector<CItemIndexForLoading>::iterator ResultEnd = set_union(Body.begin(), Body.end(), FileIndexSet.begin(),FileIndexSet.end(), Result.begin(), m_LoadLess2);
	Result.resize( ResultEnd - Result.begin());

	// the two sets must have no equivalent items: every item of both sets must reach the result
	if ( Result.size() != ExpectedCount)
	{
		fprintf (stderr,"An error occurred in function %s::AddToMemoryLoadIndexAndClear (order error)\n", GetName().c_str());
		return false;
	}

	// Result is not used any more, so its contents are exchanged with Body instead of being copied
	Body.swap(Result);

	// special check: the merged set must be sorted
	const size_t BodyCount = Body.size();
	for (size_t i = 0; i+1 < BodyCount; i++)
		if (!m_LoadLess2(Body[i], Body[i+1]) )
		{
			fprintf (stderr,"An error occurred in function %s::AddToMemoryLoadIndexAndClear (order error:Body is not sorted)\n", GetName().c_str());
			return false;
		}

	FileIndexSet.clear();
	return true;
}

//! Merges every bucket of m_InputLoadIndexHash into the bucket of m_MemoryLoadIndexHash with the same number
//! (see AddToMemoryLoadIndexAndClear).
//! \return false as soon as one bucket cannot be merged or if an exception is caught 
//!         (the exception is reported on stderr); true if all 256 buckets are merged
bool	CIndexSetForLoadingStage::AddInputLoadIndexToMemoryLoadIndex()
{
	try 
	{
		int HashNo = 0;
		while (HashNo < 256)
		{
			const bool bMerged = AddToMemoryLoadIndexAndClear(m_MemoryLoadIndexHash[HashNo], m_InputLoadIndexHash[HashNo]);
			if (!bMerged)
				return false;
			++HashNo;
		}
		return true;
	}
	catch (...)
	{
		fprintf (stderr, "Exception in CStringIndexator::AddInputLoadIndexToMemoryLoadIndex; Index Name=%s\n", GetName().c_str());
		return false;
	}
}


void	CIndexSetForLoadingStage::SortInputAndMemoryIndices()
{
	for (size_t j=0; j < 256; j++)
		for (size_t k=0; k < m_InputLoadIndexHash[j].size(); k++)
		{
			CItemIndexForLoading& Item = m_InputLoadIndexHash[j][k];
			sort(Item.GetOccurs()->begin(), Item.GetOccurs()->end());
			vector<CTokenNo>::iterator endit = unique(Item.GetOccurs()->begin(), Item.GetOccurs()->end());
			Item.GetOccurs()->erase(endit, Item.GetOccurs()->end());
		};

	for (size_t j=0; j < 256; j++)
		for (size_t k=0; k < m_MemoryLoadIndexHash[j].size(); k++)
		{
			CItemIndexForLoading& Item = m_MemoryLoadIndexHash[j][k];
			sort(Item.GetOccurs()->begin(), Item.GetOccurs()->end());
			vector<CTokenNo>::iterator endit = unique(Item.GetOccurs()->begin(), Item.GetOccurs()->end());
			Item.GetOccurs()->erase(endit, Item.GetOccurs()->end());
		};
};

#ifdef LARGE_FILE_BUG
	#pragma optimize( "", off )
#endif

bool	CIndexSetForLoadingStage::AddMemoryLoadIndexToMainLoadIndex()
{
#ifdef LARGE_FILE_BUG
	test:
#endif

	FILE* curr_fp = 0;
	FILE* main_fp = 0;
	FILE* res_fp = 0;
	int iteration = 0;
	try 
	{
		curr_fp = fopen(m_CurrOccurTempFileName.c_str(), "rb");
		if (!curr_fp) return false;

		main_fp = fopen(m_MainOccurTempFileName.c_str(), "rb");
		if (!main_fp) return  false; 

		string ResultFileName = MakeFName(m_MainOccurTempFileName, "tmp_result");
		res_fp = fopen(ResultFileName.c_str(), "wb");
		if (!res_fp ) return  false;

		CItemIndexForLoading C;
		if (!C.InitOccurs())
			return false;
		CItemIndexForLoading M;
		if (!M.InitOccurs())
			return false;


		C.ReadFromTemporalFile(curr_fp);
		M.ReadFromTemporalFile(main_fp);

		while (!feof(curr_fp)  && !feof(main_fp))
		{
			iteration++;
			if ( C.GetIndexItemOffset() == M.GetIndexItemOffset() )
			{
				if (		!M.GetOccurs()->empty()
						&&	!C.GetOccurs()->empty()
						&&	(M.GetOccurs()->back() >= *C.GetOccurs()->begin())
					)
				{
					// this place is used only if we create morphology index  ("--only-reindex-morph"  switch) 
					vector<CTokenNo> R(C.GetOccurs()->size() + M.GetOccurs()->size());
					vector<CTokenNo>::iterator end =  set_union(
							C.GetOccurs()->begin(), C.GetOccurs()->end(),
							M.GetOccurs()->begin(), M.GetOccurs()->end(), R.begin());
					R.resize(end  - R.begin());
					*M.GetOccurs() = R;

				}
				else
					M.GetOccurs()->insert(M.GetOccurs()->end(), C.GetOccurs()->begin(), C.GetOccurs()->end());

				M.WriteToTemporalFile(res_fp);

				// next  items
				C.ReadFromTemporalFile(curr_fp);
				M.ReadFromTemporalFile(main_fp);
			}
			else
				if (m_LoadLess2(C, M) )
				{
					C.WriteToTemporalFile(res_fp);
					C.ReadFromTemporalFile(curr_fp);
				}
				else
				{
					M.WriteToTemporalFile(res_fp);
					M.ReadFromTemporalFile(main_fp);
				}
		};
		
		iteration = -1;
		
		if (!feof(curr_fp))
		{
			C.WriteToTemporalFile(res_fp);
			while (C.ReadFromTemporalFile(curr_fp))
			{
				iteration--;
				C.WriteToTemporalFile(res_fp);
			};
		}
		else
		if (!feof(main_fp))
		{
			M.WriteToTemporalFile(res_fp);
			while (M.ReadFromTemporalFile(main_fp))
			{
				iteration--;
				M.WriteToTemporalFile(res_fp);
			};
		};

		iteration = 91;
		C.FreeOccurs();
		M.FreeOccurs();

		fclose(curr_fp);
		fclose(main_fp);
		fclose(res_fp);

		iteration = 92;
		if (remove(m_MainOccurTempFileName.c_str()) != 0)
		{
			fprintf (stderr,"Cannot remove file %s\n", m_MainOccurTempFileName.c_str());
			return false;
		};

		iteration = 93;
		if (rename(ResultFileName.c_str(), m_MainOccurTempFileName.c_str()) )
		{
			fprintf (stderr,"Cannot rename file %s\n", ResultFileName.c_str());
			return false;
		};

		return true;

	}
	catch (CExpc c) 
	{
		fprintf (stderr, "Exception %s in CIndexSetForLoadingStage::AddMemoryLoadIndexToMainLoadIndex; Index name=%s; iteration=%i\n",c.m_strCause.c_str(), GetName().c_str(), iteration);
	}
	catch (...) 
	{
		fprintf (stderr, "Exception in CIndexSetForLoadingStage::AddMemoryLoadIndexToMainLoadIndex; Index name=%s; iteration=%i\n",GetName().c_str(), iteration);
	}
	if (curr_fp) fclose(curr_fp);
	if (main_fp) fclose(main_fp);
	if (res_fp) fclose(res_fp);
	#ifdef LARGE_FILE_BUG
		if (!WriteVector(MakeFName(m_MainOccurTempFileName, "debug_string_buffer").c_str(), m_StringBuffer)) return false;
		goto test;
	#endif	
	return false;

};

#ifdef LARGE_FILE_BUG
	#pragma optimize( "", on )
#endif

//! Appends the StrLen characters of Str and the character that follows them (the terminating 0)
//! to the end of m_StringBuffer.
void CIndexSetForLoadingStage::AddItemStrToBuffer(const char* Str, size_t StrLen)
{
	// StrLen + 1 characters: including the last 0
	m_StringBuffer.insert(m_StringBuffer.end(), Str, Str + StrLen + 1);
}



//! Registers occurrences of the index item Str (of length StrLen).
//! A leading "-#" is stripped from Str; such an "anonymous" item is registered, but its occurrences are not stored.
//! If the item is unknown (see FindIndexItem), it is created: its string goes to m_StringBuffer and the item 
//! is inserted into the bucket of m_InputLoadIndexHash chosen by the first character of the string. 
//! Otherwise the occurrences are appended to the item found, unless the last stored occurrence is equal 
//! to the first new one.
//! If item storage is on, the offset of the item is written to the temporary storage file.
//! Throws CExpc if the string buffer is already larger than 500 MB, if the occurrence vector 
//! cannot be allocated or if the storage file cannot be written.
void CIndexSetForLoadingStage::InsertToInputLoadIndex(const char* Str, size_t StrLen, const vector<CTokenNo>& occurrences)
{
	const char AnonymousFlag[] = "-#";
	bool bHasAnonymousFlag = false;
	if (!strncmp(Str,AnonymousFlag,2))
	{
		Str    += strlen(AnonymousFlag);
		StrLen -= strlen(AnonymousFlag);
		bHasAnonymousFlag = true;
	};

	// the bucket is chosen by the first character of the item string
	int HashNo = (unsigned char)Str[0];
	vector<CItemIndexForLoading>::iterator it;
	DWORD IndexItemOffset;

	if (!FindIndexItem(Str, it, HashNo) )
	{
		// creating new index item;
		// the size limit is checked before anything is allocated, so that a failure leaks nothing
		size_t buf_len = m_StringBuffer.size();
		if (buf_len > 500000000) // 500 MB
		{
			throw CExpc (Format("index set(name = \"%s\") cannot contain more than 500 MB of different tokens", this->GetName().c_str()));
		}

		CItemIndexForLoading word;
		if (!word.InitOccurs())
			throw CExpc ("cannot allocate new vector of occurrences in CItemIndexForLoading::InitOccurs");

		if (!bHasAnonymousFlag)
			*word.GetOccurs() = occurrences;

		// the item is identified by the offset of its string in the string buffer
		word.SetIndexItemOffset(buf_len);
		AddItemStrToBuffer(Str, StrLen);
		// after a failed search "it" is the position found in m_InputLoadIndexHash[HashNo] (see FindIndexItem)
		m_InputLoadIndexHash[HashNo].insert(it, word);
		IndexItemOffset = word.GetIndexItemOffset();
	}
	else
	{
		// adding new occurrences to the existing index item;
		// occurrences[0] is read only if there is at least one new occurrence
		if	(		!bHasAnonymousFlag
				&&	!occurrences.empty()
				&&	(		it->GetOccurs()->empty()
						||	(it->GetOccurs()->back() != occurrences[0])
					)
			)
		{
			it->GetOccurs()->insert(it->GetOccurs()->end(),  occurrences.begin(), occurrences.end());
		}

		IndexItemOffset = it->GetIndexItemOffset();
	}

	if (m_bUseItemStorage)
	{
		// a storage file that is not open is reported like a failed write instead of being passed to fwrite
		if (		!m_TempStorageFile
				||	(fwrite(&IndexItemOffset,sizeof(DWORD),1, m_TempStorageFile) != 1)
			)
			throw CExpc ("Cannot write to temporary storage file");
	};
}



//! Converts the temporary storage file to the persistent file PersistentFileName:
//! every buffer offset read from the temporary file is replaced by the ordinal number of the item 
//! with this offset (items are numbered through the buckets of m_MemoryLoadIndexHash, in order).
//! Item storage must be on (checked by an assertion).
//! \return true if the file is converted or if there is no open temporary storage file; 
//!         false if a file cannot be opened, read or written, or if an offset belongs to no item
bool	CIndexSetForLoadingStage::ConvertTempStorageToPersistent(string PersistentFileName)
{
	assert (m_bUseItemStorage);

	// if the procedure is called from ConcordAdd, then 
	//  the persistent file should be already built and m_TempStorageFile is null, no other actions
	if (!m_TempStorageFile) return true;

	// create a map from buffer offset to the ordinal number of the index item
	map<DWORD,DWORD> BufferOffset2TokenNo;
	{
		size_t No = 0;
		for (size_t j=0; j < 256; j++)
			for (size_t k=0; k < m_MemoryLoadIndexHash[j].size(); k++)
			{
				BufferOffset2TokenNo[m_MemoryLoadIndexHash[j][k].GetIndexItemOffset()] = No;
				No++;
			};
	};

	// the file was open for writing: reopen it for reading
	fclose (m_TempStorageFile);
	m_TempStorageFile = fopen(m_TempStorageFileName.c_str(), "rb");
	if (!m_TempStorageFile) return false;

	FILE * out_fp = fopen(PersistentFileName.c_str(), "wb");
	if (!out_fp) return false;

	// the records have the size used by InsertToInputLoadIndex when writing them: sizeof(DWORD)
	DWORD BufferOffset;
	while (fread(&BufferOffset, sizeof(DWORD), 1, m_TempStorageFile) == 1)
	{
		map<DWORD,DWORD>::const_iterator it = BufferOffset2TokenNo.find(BufferOffset);
		if (it == BufferOffset2TokenNo.end()) 
		{
			fclose (out_fp);
			return false;
		}
		if (fwrite(&(it->second), sizeof(DWORD), 1, out_fp) != 1) 
		{
			fclose (out_fp);
			return false;
		};
	};

	// the loop also stops at a read error, which must not be taken for the end of the file
	const bool bReadFailed = (ferror(m_TempStorageFile) != 0);

	fclose (m_TempStorageFile);
	m_TempStorageFile = 0;

	// fclose flushes buffered data, so its failure means that the persistent file is incomplete
	const bool bCloseFailed = (fclose (out_fp) != 0);

	return !bReadFailed && !bCloseFailed;
}

