#include "core/ThreadProfiler.h"

#if RDE_THREAD_PROFILER_ENABLED

#pragma message("INFO: Thread profiler enabled")

#include "core/LockGuard.h"
#include "core/Mutex.h"
#include "core/RdeAssert.h"
#include "core/StackTrace.h"
#include "core/System.h"
#include "core/Thread.h"
#include "core/Timer.h"
#include "core/win32/Windows.h"
#include <vector>
#include <algorithm>

namespace
{
// RDTSC is quicker, but less reliable, not recommended.
#define USE_RDTSC	0

	// Returns the current timestamp used by the profiler.
	// With USE_RDTSC set, the value is read with the rdtsc instruction; otherwise it
	// is whatever rde::Timer::Now() returns.
	// NOTE(review): the rdtsc branch has no return statement, so it presumably relies
	// on the compiler treating the EDX:EAX pair left by rdtsc as the 64-bit return
	// value (MSVC x86 inline-asm behaviour) -- confirm before enabling USE_RDTSC.
	// NOTE(review): the cpuid before rdtsc looks like an ordering barrier so earlier
	// instructions retire before the counter is sampled -- confirm.
	rde::uint64_t RDE_FORCEINLINE GetTicks()
	{
#if USE_RDTSC
		__asm
		{
			xor	eax, eax
			cpuid
			rdtsc
		}
#else
		return rde::Timer::Now();
#endif
	}
	// Returns the tick rate that SaveEvents stores in the file header.
	// The rate is determined on the first call and kept in a function-local static;
	// later calls return the cached number. No locking protects that cache.
	// NOTE(review): the non-RDTSC path reports the QueryPerformanceFrequency value,
	// which assumes rde::Timer::Now() counts in performance-counter units -- confirm.
	double GetTicksPerSecond()
	{
		// Zero means "not determined yet".
		static double s_ticksPerSecond(0.0);
		if (!(s_ticksPerSecond > 0.0))
		{
#if USE_RDTSC
			// Calibration: spin for a fixed number of performance-counter counts and
			// measure how many GetTicks() units go by in the meantime.
			__int64 qpcFrequency(0);
			QueryPerformanceFrequency(reinterpret_cast< LARGE_INTEGER* >(&qpcFrequency));
			LARGE_INTEGER qpcBegin;
			QueryPerformanceCounter(&qpcBegin);

			static const LONGLONG kWaitCounts = 500000;
			const LONGLONG qpcDeadline = qpcBegin.QuadPart + kWaitCounts;
			LARGE_INTEGER qpcNow;
			qpcNow.QuadPart = 0;
			const unsigned __int64 tscBegin = GetTicks();
			while (qpcNow.QuadPart < qpcDeadline)
				QueryPerformanceCounter(&qpcNow);
			const unsigned __int64 tscElapsed = GetTicks() - tscBegin;

			// Frequency divided by the waited counts is the reciprocal of the nominal
			// wait time in seconds, so the product below is ticks per second.
			const double waitsPerSecond = double(qpcFrequency) / kWaitCounts;
			s_ticksPerSecond = double(tscElapsed) * waitsPerSecond;
#else
			LARGE_INTEGER freq;
			const bool ok = QueryPerformanceFrequency(&freq) != FALSE;
			RDE_ASSERT(ok);
			s_ticksPerSecond = double(freq.QuadPart);
#endif
		}
		return s_ticksPerSecond;
	}
	#pragma pack(push, 1)
	// Record for one object registered through ThreadProfiler::AddObject: the
	// object's address plus the call stack captured at registration time.
	// SaveEvents writes these records to the output file as raw bytes, and the
	// struct sits inside a #pragma pack(push, 1) region, so field order and sizes
	// are part of the file layout.
	struct ThreadObject
	{
		// Number of call stack slots stored per record.
		static const int kMaxCallStack	= 10;
		// Address passed to AddObject.
		const void*					address;
		// Filled by rde::StackTrace::GetCallStack_Slow in AddObject.
		rde::StackTrace::Address	callStack[kMaxCallStack];
	};
	// One recorded profiler event. Created in ThreadProfiler::AddEvent with
	// aggregate initialisation (type, userData, threadId, ticks -- in that order)
	// and written to the output file as raw bytes by SaveEvents. The struct sits
	// inside a #pragma pack(push, 1) region, so field order and sizes are part of
	// the file layout.
	struct ThreadProfileEvent
	{
		// Orders events by timestamp only; SaveEvents uses this through std::sort.
		bool operator<(const ThreadProfileEvent& rhs) const
		{
			return ticks < rhs.ticks;
		}

		// Kind of event, as passed to AddEvent.
		rde::ThreadProfiler::Event::Enum	type;
		// Opaque pointer supplied by the AddEvent caller (0 when omitted).
		const void*							userData;
		// Result of Thread::GetCurrentThreadId() on the recording thread.
		int									threadId;
		// GetTicks() value at recording time minus s_baseTicks.
		rde::uint64_t						ticks;
	};
	#pragma pack(pop)
	// Identity of a thread that called ThreadProfiler::SubmitEvents.
	// SaveEvents writes 'id' followed by the complete 'name' buffer for each entry.
	struct ThreadDesc
	{
		// Result of Thread::GetCurrentThreadId() on the submitting thread.
		int		id;
		// Thread name, or "<no name>" when the thread has no name or an empty one.
		char	name[64];
	};

	// Append-only buffer that grows on demand and starts overwriting from index 0
	// once TMaxSize elements have been stored.
	//
	// The struct is deliberately a plain aggregate (no constructors, destructor or
	// private members) so it can be initialised with { 0, 0, 0, 0 }; as a result the
	// storage is never released by the array itself.
	//
	// State:
	//	m_items		heap storage, 0 until the first PushBack
	//	m_size		index of the next slot to write
	//	m_maxSize	0 until the first wrap-around, afterwards the element count
	//				that was reached when the wrap happened
	//	m_capacity	number of slots allocated in m_items
	// The number of valid elements is therefore max(m_size, m_maxSize).
	template<typename T, int TMaxSize>
	struct GrowingArray
	{
		// Read-only element access. No bounds checking is performed.
		const T& operator[](int i) const
		{
			return m_items[i];
		}
		// Appends a copy of 't', enlarging the storage first when it is full.
		void PushBack(const T& t)
		{
			if (m_size >= m_capacity)
				Grow();
			m_items[m_size++] = t;
			// Wrap around. We record time moments, so it's safe.
			if (m_size >= TMaxSize)
			{
				m_maxSize = m_size;
				m_size = 0;
			}
		}
		// Enlarges the storage: 4096 slots on first use, double the current
		// capacity afterwards. Existing elements are carried over with T's
		// assignment operator, so T does not have to be safe to copy with a raw
		// memory copy, and nothing is read from m_items while it is still null.
		void Grow()
		{
			const int newCapacity = (m_capacity == 0 ? 4096 : m_capacity * 2);
			T* newItems = new T[newCapacity];
			// Only slots that have actually been written are worth copying.
			const int numValid = (m_maxSize > m_size ? m_maxSize : m_size);
			for (int i = 0; i < numValid; ++i)
				newItems[i] = m_items[i];
			delete[] m_items;
			m_items = newItems;
			m_capacity = newCapacity;
		}

		T*		m_items;
		int		m_size;
		int		m_maxSize;
		int		m_capacity;
	};

	// Buffer types: up to 512 * 1024 * 1024 events and up to 8192 tracked objects
	// are kept before the respective buffer wraps around.
	typedef GrowingArray<ThreadProfileEvent, 512 * 1024 * 1024>	ThreadProfileEventArray;
	typedef GrowingArray<ThreadObject, 8192>					ThreadObjects;
	// Buffers filled by AddEvent / AddObject and drained by SubmitEvents.
	// Declared with RDE_THREADLOCAL (presumably one instance per thread -- the macro
	// is defined outside this file) and zero-initialised as aggregates.
	RDE_THREADLOCAL ThreadProfileEventArray	t_events = { 0, 0, 0, 0 };
	RDE_THREADLOCAL ThreadObjects			t_objects = { 0, 0, 0, 0 };
	// GetTicks() value taken by the first AddEvent call; event timestamps are
	// stored relative to it. Zero means "not set yet".
	rde::uint64_t							s_baseTicks = 0;
	// Data collected from SubmitEvents calls and written out by SaveEvents.
	std::vector<ThreadProfileEvent>			s_allEvents;
	std::vector<ThreadObject>				s_allObjects;
	std::vector<ThreadDesc>					s_allThreads;
	// Held while s_allObjects or s_allThreads is accessed.
	rde::Mutex								s_allObjectsMutex;
	// Held while s_allEvents is accessed.
	rde::Mutex								s_allEventsMutex;
}
namespace rde
{
void ThreadProfiler::AddObject(void* obj)
{
	::ThreadObject o;
	o.address = obj;
	rde::StackTrace::GetCallStack_Slow(o.callStack, o.kMaxCallStack, 1);
	t_objects.PushBack(o);
}

// Records one event of the given type in the calling thread's event buffer.
// 'userData' is stored as-is. The timestamp is kept relative to the tick value
// sampled by the very first AddEvent call.
// NOTE(review): s_baseTicks is a plain global that is tested and assigned here
// without any lock, so two threads recording their first event at the same time
// can both assign it -- confirm whether that matters for the callers.
void ThreadProfiler::AddEvent(Event::Enum type, const void* userData /*= 0*/)
{
	// Lazily establish the time origin.
	if (s_baseTicks == 0)
	{
		s_baseTicks = ::GetTicks();
	}

	::ThreadProfileEvent record;
	record.type = type;
	record.userData = userData;
	record.threadId = Thread::GetCurrentThreadId();
	record.ticks = ::GetTicks() - s_baseTicks;

	t_events.PushBack(record);
}

void ThreadProfiler::SubmitEvents()
{
	{
		LockGuard<Mutex> lock(s_allObjectsMutex);
		const char* threadName = Thread::GetCurrentThreadName();
		const int threadId = Thread::GetCurrentThreadId();
		::ThreadDesc desc;
		strcpy_s(desc.name, threadName && *threadName ? threadName : "<no name>");
		desc.id = threadId;
		s_allThreads.push_back(desc);
	}

	const int numEvents = 
		t_events.m_maxSize > t_events.m_size ? t_events.m_maxSize : t_events.m_size;
	for (int i = 0; i < numEvents; ++i)
	{
		LockGuard<Mutex> lock(s_allEventsMutex);
		s_allEvents.push_back(t_events[i]);
	}
	const int numObjects = 
		t_objects.m_maxSize > t_objects.m_size ? t_objects.m_maxSize : t_objects.m_size;
	for (int i = 0; i < numObjects; ++i)
	{
		LockGuard<Mutex> lock(s_allObjectsMutex);
		s_allObjects.push_back(t_objects[i]);
	}
}

bool ThreadProfiler::SaveEvents(const char* fileName)
{
	FILE* f = fopen(fileName, "wb");
	if (!f)
		return false;

	const double ticksPerSecond = ::GetTicksPerSecond();
	::fwrite(&ticksPerSecond, sizeof(ticksPerSecond), 1, f);

	StackTrace::ModuleDesc desc;
	StackTrace::GetMainModuleDesc(desc);
	::fwrite(&desc.baseAddress, sizeof(desc.baseAddress), 1, f);
	::fwrite(&desc.size, sizeof(desc.size), 1, f);
	::fwrite(&desc.pdbName[0], sizeof(desc.pdbName), 1, f);

	{
		LockGuard<Mutex> lock(s_allObjectsMutex);
		const size_t numThreads = s_allThreads.size();
		::fwrite(&numThreads, sizeof(numThreads), 1, f);
		for (size_t i = 0; i < numThreads; ++i)
		{
			const ::ThreadDesc& desc = s_allThreads[i];
			::fwrite(&desc.id, sizeof(desc.id), 1, f);
			::fwrite(&desc.name[0], sizeof(desc.name), 1, f);
		}

		const size_t numObjects = s_allObjects.size();
		::fwrite(&numObjects, sizeof(numObjects), 1, f);
		::fwrite(&s_allObjects[0], sizeof(ThreadObject), numObjects, f);
	}
	{
		LockGuard<Mutex> lock(s_allEventsMutex);
		std::sort(s_allEvents.begin(), s_allEvents.end());
		const size_t numEvents = s_allEvents.size();
		::fwrite(&numEvents, sizeof(numEvents), 1, f);
		for (size_t i = 0; i < numEvents; ++i)
		{
			const ::ThreadProfileEvent& e = s_allEvents[i];
			::fwrite(&e, sizeof(e), 1, f);
		}
	}
	::fclose(f);
	return true;
}
}

#endif // RDE_THREAD_PROFILER_ENABLED

