
/* ************************************************************************ *
 *              Written by Alex de Kruijff                2009              *
 * ************************************************************************ *
 * This source was written with a tabstop every four characters             *
 * In vi type :set ts=4                                                     *
 * ************************************************************************ */

#include "configure.h"
#include "hash.h"
#include "toolkit.h"
#include "visitor.h"
#include "storage.h"
#include "filegroup.h"
#include "matchmatrix.h"
#include "sizegroup.h"
#include "list.h"

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

#include <new>

/* FILE_BY_LOGIC is OR-ed into a match result that was derived from other
 * results (a==b and a==c => b==c) instead of from comparing the two files
 * directly. It defaults to FILE_USER1 unless it was defined earlier.
 */
#ifndef FILE_BY_LOGIC
#define FILE_BY_LOGIC FILE_USER1
#endif

/* If READ_ONLY_ONCES is defined the code will read each file only once.
 * If READ_ONLY_ONCES is not defined the code will use less memory.
 */
#ifdef READ_ONLY_ONCES
/* Scratch memory used while the files of one size group are compared.
 *
 * One allocation of `size` bytes (made by the constructor) is divided into
 * `n` slots of `pagesize` bytes each: setN() chooses the split and
 * operator[] returns the start of a slot. The single file-local instance,
 * `buffer`, is defined together with the class.
 */
static class SameGroupBuffer
{
	size_t n, size, pagesize;
	char *buf;

	// The object owns buf, so a copy would free the same array twice.
	// Declared private and intentionally left undefined.
	SameGroupBuffer(const SameGroupBuffer &);
	SameGroupBuffer &operator=(const SameGroupBuffer &);

public:
	SameGroupBuffer();
	~SameGroupBuffer();

	// Sets the number of slots and recomputes the size of one slot.
	void setN(size_t);

	// Size of one slot in bytes, as computed by the last call to setN().
	size_t getPageSize() const { return pagesize; }

	// Start address of slot `index`.
	char *operator[](size_t index);
} buffer;

/* Allocates the shared buffer: EXPECTED_MAX_GROUP times the system page
 * size (4096 bytes when the page size is not available), limited to 1 MiB
 * when LOW_MEMORY_PROFILE is defined.
 *
 * n and pagesize start at zero; they get their real value from setN().
 * The allocation reports failure by throwing std::bad_alloc.
 */
SameGroupBuffer::SameGroupBuffer()
	: n(0), size(EXPECTED_MAX_GROUP), pagesize(0), buf(NULL)
{
	size_t unit = 4096;
#ifdef _SC_PAGESIZE
	// sysconf() reports failure with -1. Keep the fallback in that case
	// instead of multiplying by (size_t)-1.
	const long reported = sysconf(_SC_PAGESIZE);
	if (reported > 0)
		unit = (size_t)reported;
#endif // _SC_PAGESIZE
	size *= unit;
#ifdef LOW_MEMORY_PROFILE
	if (size > 1048576)
		size = 1048576;
#endif // LOW_MEMORY_PROFILE
	buf = new char[size];
}

/* Releases the shared buffer. */
SameGroupBuffer::~SameGroupBuffer()
{
	// buf was allocated with new char[size], so the array form of delete
	// is required; plain delete on it is undefined behaviour.
	delete[] buf;
}

/* Divides the buffer over n slots.
 *
 * The slot size is size / n, limited to one system page (or to 32 KB when
 * _SC_PAGESIZE is not available). With n == 0 there are no slots and the
 * slot size is set to 0.
 */
void SameGroupBuffer::setN(size_t n)
{
	this->n = n;
	if (n == 0)
	{
		// Nothing to divide; also avoids a division by zero.
		pagesize = 0;
		return;
	}
	pagesize = size / n;
#ifdef _SC_PAGESIZE
	// Limit the slot size, not the buffer size: size must keep describing
	// the whole allocation for the next call.
	const size_t limit = (size_t)sysconf(_SC_PAGESIZE);
	if (pagesize > limit)
		pagesize = limit;
#else // _SC_PAGESIZE
	if (pagesize > 32 * 1024) // 32 KB
		pagesize = 32 * 1024;
#endif // _SC_PAGESIZE
}

/* Returns the start of slot `index` inside the shared buffer.
 * With DEBUG defined an index outside the n slots ends the program.
 */
char *SameGroupBuffer::operator[](size_t index)
{
#ifdef DEBUG
	if (n <= index)
	{
		fprintf(stderr, "%s:%d there aren't that many indexes\n",
			__FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
#endif // DEBUG
	// Slots are laid out back to back, pagesize bytes apart.
	char *slot = buf;
	slot += index * pagesize;
	return slot;
}
#endif // READ_ONLY_ONCES

// Scratch key shared by every SizeGroup: it is overwritten with the key to
// look for right before a keyed lookup in hash (see operator[]).
FileGroup SizeGroup::tmp;

int SizeGroup::compare(const void *a, const void *b) throw()
{
	return (**(SizeGroup **)b).fileSize - (**(SizeGroup **)a).fileSize;
}

/* ************************************************************************ */

/* Hashes a SizeGroup on its file size.
 * The fileSize member is handed to hashword() as an array of hash_t words
 * that together span sizeof(off_t) bytes.
 */
hash_t SizeGroup::hashFunction(const SizeGroup &obj) throw()
{
	const size_t words = sizeof(off_t) / sizeof(hash_t);
	hash_t *key = (hash_t *)&obj.fileSize;
	return hashword(key, words);
}

/* Returns the FileGroup that belongs to key, creating it when needed.
 *
 * key - stat information used to look the group up; it is first copied
 *       into the shared scratch object tmp.
 * Throws std::bad_alloc when the new group cannot be allocated or cannot
 * be added to hash; in the latter case the new group is destroyed again.
 */
FileGroup &SizeGroup::operator[](const struct stat &key) throw (std::bad_alloc)
{
	tmp = key;
	if (hash[tmp] != NULL)
		return *hash[tmp];
	FileGroup *ptr = new FileGroup(key); // throws bad_alloc
	try
	{
		hash += *ptr; // throws bad_alloc
		return *ptr;
	}
	catch(const std::bad_alloc &)
	{
		delete ptr;
		// Rethrow the original exception object; throw(e) would throw a
		// copy that is sliced down to std::bad_alloc.
		throw;
	}
}

/* Offers this group to the visitor and, unless visit() returns non-zero,
 * passes the visitor on to every FileGroup stored in hash.
 */
void SizeGroup::accept(SamefileVisitor &v)
{
	if (!v.visit(*this))
	{
		const size_t boundry = hash.getBoundry();
		for (size_t slot = 0; slot < boundry; ++slot)
		{
			// Not every position up to the boundary holds a group.
			if (hash[slot] == NULL)
				continue;
			hash[slot]->accept(v);
		}
	}
}

/* Sorts the content of this group.
 *
 * compare         - order used for the FileGroups in hash.
 * compareFilename - order handed to every FileGroup for its own sort().
 *
 * hash is converted to a vector first. With DEBUG defined the program ends
 * when a FileGroup without files is found, before or after the FileGroups
 * were sorted.
 */
void SizeGroup::sort(int (&compare)(const void *a, const void *b),
	int (&compareFilename)(const void *a, const void *b)) throw()
{
	// The number of groups is read before the container is converted.
	const size_t groups = hash.getSize();
	hash.convert(CONTAINER_VECTOR);

	size_t idx;
#ifdef DEBUG
	for (idx = 0; idx < groups; ++idx)
	{
		if (hash[idx]->getSize() != 0)
			continue;
		fprintf(stderr, "%s:%d FileGroup child doesn't have any File(s)\n",
			__FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
#endif // DEBUG

	// Sort the files inside each group ...
	idx = 0;
	while (idx < groups)
	{
		hash[idx]->sort(compareFilename);
		++idx;
	}

#ifdef DEBUG
	for (idx = 0; idx < groups; ++idx)
	{
		if (hash[idx]->getSize() != 0)
			continue;
		fprintf(stderr, "%s:%d FileGroup child doesn't have any File(s)\n",
			__FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
#endif // DEBUG

	// ... and then the groups themselves.
	hash.sort(compare);
}

#ifdef WITH_DISK_STORAGE
/* Hands this group to storage through accept(), remembers storage for a
 * later diskRead() and then empties every FileGroup in hash.
 *
 * Returns whatever storage.close() returns.
 */
size_t SizeGroup::diskWrite(Storage &storage) throw()
{
	accept(storage);
	this->storage = &storage;

	for (size_t slot = 0, boundry = hash.getBoundry(); slot < boundry; ++slot)
	{
		if (hash[slot] == NULL)
			continue;
		hash[slot]->empty();
	}
	return storage.close();
}

/* Reads the paths of this size group back from the attached storage.
 *
 * addingAllowed - called with each path and the FileGroup found for it;
 *                 the path is only added when it returns non-zero.
 *
 * Returns 0 when no storage is attached, -1 when the storage cannot be
 * opened and otherwise the value returned by storage->close().
 */
int SizeGroup::diskRead(
	int (&addingAllowed)(const char *, const FileGroup &))
	throw (std::bad_alloc)
{
	if (storage == NULL)
		return 0;
	if (storage->open(fileSize) < 0)
		return -1;

	for (;;)
	{
		// read() fills the scratch key tmp with the device and inode
		// that belong to the returned path.
		const char *path = storage->read(tmp.device, tmp.inode);
		if (path == NULL)
			break;
		if (!addingAllowed(path, *hash[tmp]))
			continue;
		*hash[tmp] += path; // throws bad_alloc
	}

	size_t counter = storage->close();
	storage->clean();
	return counter;
}
#endif // WITH_DISK_STORAGE

/* ************************************************************************ */

/* Handles the pair (i, j) of a size group: determines its result when
 * needed, reports it through f and derives results for other pairs.
 *
 * local       - the size group both file groups belong to; passed to f.
 * match       - matrix with the results per pair; read and updated.
 * f           - callback that receives both groups, one filename of each
 *               and the result.
 * fileSize    - passed to FileGroup::fcmp() when the files must be read.
 * nIdenticals - incremented when fcmp() reports the pair as identical.
 * flags       - when FILE_BY_LOGIC is set, results that carry
 *               FILE_BY_LOGIC are reported through f as well.
 * i, j        - indexes of left and right; j is set to n after an
 *               open/read error on the first file.
 * n           - number of file groups.
 * left, right - the two file groups of the pair.
 */
inline void actOnResult(SizeGroup &local, MatchMatrix &match,
	int (&f)(const SizeGroup &, const FileGroup &, const Filename &, 
		const FileGroup &, const Filename &, int),
	size_t fileSize, size_t &nIdenticals, 
	int flags, size_t &i, size_t &j, size_t &n,
	FileGroup &left, FileGroup &right)
{
	int result = match.get(i, j);
	// Both iterators are deleted at the end of this function.
	Iterator<Filename> &leftIterator = *left.createIterator();
	Iterator<Filename> &rightIterator = *right.createIterator();
#ifdef DEBUG
	if (!left.getSize())
	{
		fprintf(stderr, "%s:%d left filegroup is empty\n",
			__FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
	if (!right.getSize())
	{
		fprintf(stderr, "%s:%d right filegroup is empty\n",
			__FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
#endif // DEBUG

#ifndef READ_ONLY_ONCES
	// Do a physical check if we couldn't use logic, i.e. when the matrix
	// holds no result (0) for this pair yet.
	if (!result)
	{
		result = left.fcmp(right, fileSize);
		if (result > 0 && result & FILE_IDENTICAL)
			++nIdenticals;
		else if (result < 0)
			// Negative results are reported right away, using the item
			// each iterator currently refers to.
			// NOTE(review): getItem() is not checked for NULL here, while
			// the loops below do check it - confirm it cannot be NULL.
			f(local, left, *leftIterator.getItem(),
				right, *rightIterator.getItem(), result);
		match.set(i, j, result);
	}
#endif // READ_ONLY_ONCES

	switch(result)
	{
		case FILE_IDENTICAL | FILE_BY_LOGIC:
		case FILE_DIFFERENT | FILE_BY_LOGIC:
		case FILE_IDENTICAL:
		case FILE_DIFFERENT:
		{
			// Report when the result does not carry FILE_BY_LOGIC, or when
			// it does and the caller asked for those through flags.
			if ((flags & FILE_BY_LOGIC) && (result & FILE_BY_LOGIC) ||
				!(result & FILE_BY_LOGIC))
			{
				int skip = 0;
				for(; !leftIterator.end(); ++leftIterator)
				{
					if (leftIterator.getItem() == NULL)
						continue;
					// The inner loop always runs to the end; a non-zero
					// return of f only stops the outer loop afterwards.
					for (rightIterator.reset(); !rightIterator.end(); ++rightIterator)
						if (rightIterator.getItem() != NULL &&
							f(local, left, *leftIterator.getItem(),
							right, *rightIterator.getItem(), result))
							skip = 1;
					if (skip == 1)
						break;
				}
			}
#ifndef READ_ONLY_ONCES
			// a==b (00) a==c (01) => b==c (10)  | rowSize = 10
			// a!=d (02) => b!=d (12) c!=d (22)
			// For every k between i and j the result of (i, j) is combined
			// with the result of (i, k) to derive the result of (k, j).
			for (size_t k = i + 1; k <= j; ++k)
				if (j != k)
					switch(result | match.get(i, k))
					{
						case FILE_IDENTICAL:
							match.set(k, j,
								FILE_IDENTICAL | FILE_BY_LOGIC);
						break;
						case FILE_IDENTICAL | FILE_DIFFERENT:
							match.set(k, j,
								FILE_DIFFERENT | FILE_BY_LOGIC);
						break;
					}
#endif // READ_ONLY_ONCES
		}
		break;

#ifndef READ_ONLY_ONCES
		// The first file could not be opened or read. j is a reference, so
		// this changes the caller's variable as well.
		case FILE_OPEN1_ERROR:
		case FILE_READ1_ERROR:
				j = n; 
		break;

		// The second file could not be opened or read: store the same
		// result for the pairs (k, j) with k between i and j.
		case FILE_OPEN2_ERROR:
		case FILE_READ2_ERROR:
			for (size_t k = i + 1; k < j; ++k)
				match.set(k, j, result);
		break;
#endif // READ_ONLY_ONCES
	}

	delete &leftIterator;
	delete &rightIterator;
}

/* Compares the FileGroups of this size group with each other and hands
 * the result of every pair to actOnResult(), which reports through f.
 *
 * match    - matrix that holds the result per pair while comparing; it is
 *            reset with match.reset(n) before returning.
 * f        - callback that receives the results.
 * flags    - passed on to actOnResult().
 * preCheck - optional (may be NULL). A non-zero return means the pair does
 *            not have to be read: with READ_ONLY_ONCES the pair is marked
 *            FILE_DIFFERENT, without it the pair is skipped.
 *
 * Returns the number of pairs counted as identical, or 0 when the group
 * holds fewer than two FileGroups.
 */
size_t SizeGroup::compareFiles(MatchMatrix &match,
	int (&f)(const SizeGroup &, const FileGroup &, const Filename &, 
		const FileGroup &, const Filename &, int),
	int flags,
	int (*preCheck)(const SizeGroup &, const FileGroup &, const FileGroup &)
	) throw (std::bad_alloc)
{
	size_t n = hash.getSize();
	if (n < 2)
		return 0;
	size_t nIdenticals = 0;

	// Make sure the container is a vector
	hash.convert(CONTAINER_VECTOR);

	/* If READ_ONLY_ONCES is defined the code will read each file only once.
	 * If READ_ONLY_ONCES is not defined the code will use less memory.
	 */
#ifdef READ_ONLY_ONCES
	// Open files
	buffer.setN(n);
	for (size_t i = 0; i < n; ++i)
	{
#ifdef DEBUG
		if (hash[i] == NULL)
		{
			fprintf(stderr, "%s:%d filegroup doesn't exist\n",
				__FILE__, __LINE__);
			exit(EXIT_FAILURE);
		}
		if (!hash[i]->getSize())
		{
			fprintf(stderr, "%s:%d filegroup is empty\n",
				__FILE__, __LINE__);
			exit(EXIT_FAILURE);
		}
#endif // DEBUG
		// Check so we can save on hard disk access.
		for (size_t j = i + 1; j < n; ++j)
			if (preCheck != NULL && preCheck(*this, *hash[i], *hash[j]))
			{
				match.set(i, j, FILE_DIFFERENT);
				match.increaseEqual(i);
				match.increaseEqual(j);
			}

		// Open files, unless the counter of file i already covers all the
		// other files. An open error is stored in the last column.
		if (match.getEqual(i) < n -1 &&
			match.setFd(i, hash[i]->open(O_RDONLY)) < 0)
			match.set(i, n-1, FILE_OPEN1_ERROR);

	}

	// Compare the files, one slot of the shared buffer at a time.
	// NOTE(review): a slot size of 0 would keep this loop from advancing -
	// confirm buffer.setN() can never produce it for the n used here.
	size_t size = buffer.getPageSize();
	for (off_t offset = 0; offset < fileSize; offset += size)
	{
		// The last part of the file can be shorter than a slot.
		if (offset + size > fileSize)
			size = fileSize - offset;

		// read pages
		for (size_t i = 0; i < n; ++i)
		{
			if (match.get(i, n-1) >= 0 && match.getEqual(i) < n - 1)
			{
				if (USE_MMAP(fileSize))
				{
					void *mm = mmap(0, size, PROT_READ, MAP_SHARED, match.getFd(i), offset);
					if (mm == MAP_FAILED)
					{
						// Do not keep MAP_FAILED around: it must not be
						// handed to posix_madvise() or munmap() later on.
						match.set(i, n-1, FILE_READ1_ERROR);
						match.setMm(i, NULL);
					}
					else
					{
						match.setMm(i, mm);
						posix_madvise(mm, size, POSIX_MADV_WILLNEED);
					}
				}
				else if (read(match.getFd(i), buffer[i], size) < 0)
					match.set(i, n-1, FILE_READ1_ERROR);
			}
		}

		// compare pages
		for (size_t i = 0; i < n; ++i)
		{
			if (match.getEqual(i) == n - 1 || match.get(i, n-1) < 0)
				continue;
			for (size_t j = i + 1; j < n; ++j)
				if (match.getEqual(j) == n - 1 || match.get(j, n-1) < 0 ||
					match.get(i, j) == FILE_DIFFERENT)
					continue;
				else if (USE_MMAP(fileSize))
				{
					if (memcmp(match.getMm(i), match.getMm(j), size) != 0)
					{
							match.set(i, j, FILE_DIFFERENT);
							match.increaseEqual(i);
							match.increaseEqual(j);
					}
				}
				else if (memcmp(buffer[i], buffer[j], size) != 0)
				{
					match.set(i, j, FILE_DIFFERENT);
					match.increaseEqual(i);
					match.increaseEqual(j);
				}
		}

		// unread?
		for (size_t i = 0; i < n; ++i)
			if (match.getMm(i) != NULL)
			{
				posix_madvise(match.getMm(i), size, POSIX_MADV_NORMAL);
				munmap(match.getMm(i), size);
				match.setMm(i, NULL);
			}
	}

	// close files
	// Only descriptors that were really opened are closed: a failed open
	// leaves a negative value and 0 is treated as "not opened". The old
	// test (!fd != 0) was only true for descriptor 0, so nothing that was
	// opened here ever got closed.
	for (size_t i = 0; i < n; ++i)
		if (match.getFd(i) > 0)
		{
			close(match.getFd(i));
			// Forget the descriptor so it cannot be closed a second time.
			match.setFd(i, 0);
		}

	// mark identicals: every pair that still has no result
	for (size_t i = 0; i < n; ++i)
		for (size_t j = i + 1; j < n; ++j)
			if (match.get(i, j) == 0)
			{
				++nIdenticals;
				match.set(i, j, FILE_IDENTICAL);
			}

	// mark found by logic
	int result;
	for (size_t i = 0; i < n; ++i)
		for (size_t j = i + 1; j < n; ++j)
		{
			result = match.get(i, j);
			// (0, 1), (0, 2) (1, 2) i = 1 j =2
			for (size_t k = i + 1; k < j; ++k)
				switch(result | match.get(i, k))
				{
					case FILE_IDENTICAL:
						match.set(k, j,
							FILE_IDENTICAL | FILE_BY_LOGIC);
					break;

					case FILE_IDENTICAL | FILE_DIFFERENT:
						match.set(k, j,
							FILE_DIFFERENT | FILE_BY_LOGIC);
					break;
				}
		}

	// do something with the result
	for (size_t i = 0; i < n; ++i)
	{
#ifdef DEBUG
		if (hash[i] == NULL)
		{
			fprintf(stderr, "%s:%d left filegroup doesn't exist\n",
				__FILE__, __LINE__);
			exit(EXIT_FAILURE);
		}
#endif // DEBUG
		for (size_t j = i + 1; j < n; ++j)
		{
#ifdef DEBUG
			if (hash[j] == NULL)
			{
				fprintf(stderr, "%s:%d right filegroup doesn't exist\n",
					__FILE__, __LINE__);
				exit(EXIT_FAILURE);
			}
#endif // DEBUG

			// do something with the result.
			actOnResult(*this, match, f, fileSize, nIdenticals,
				flags, i, j, n, *hash[i], *hash[j]);
		}
	}
#else // READ_ONLY_ONCES
	// Compare the files.
	for (size_t i = 0; i < n; ++i)
	{
#ifdef DEBUG
		if (hash[i] == NULL)
		{
			fprintf(stderr, "%s:%d left filegroup doesn't exist\n",
				__FILE__, __LINE__);
			exit(EXIT_FAILURE);
		}
#endif // DEBUG
		// actOnResult() receives j by reference and sets it to n to end
		// this inner loop after an error on the left file.
		for (size_t j = i + 1; j < n; ++j)
		{
#ifdef DEBUG
			if (hash[j] == NULL)
			{
				fprintf(stderr, "%s:%d right filegroup doesn't exist\n",
					__FILE__, __LINE__);
				exit(EXIT_FAILURE);
			}
#endif // DEBUG

			// Check so we can save on hard disk access.
			if (preCheck != NULL && preCheck(*this, *hash[i], *hash[j]))
				continue;

			// do something with the result.
			actOnResult(*this, match, f, fileSize, nIdenticals,
				flags, i, j, n, *hash[i], *hash[j]);
		}
	}
#endif // READ_ONLY_ONCES
	match.reset(n);
	return nIdenticals;
}

