
/* ************************************************************************ *
 *                         This is samefile driver.                         *
 * ************************************************************************ *
 *              Written by Alex de Kruijff                2009              *
 * ************************************************************************ *
 * This source was written with a tabstop every four characters             *
 * In vi type :set ts=4                                                     *
 * ************************************************************************ */

#include "configure.h"
#include "toolkit.h"
#include "storage.h"
#include "write2disk.h"
#include "printhardlinked.h"
#include "stats.h"
#include "holder.h"
#include "main.h"

#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif // HAVE_LIMITS_H
#include <stdio.h>
#ifdef HAVE_STRING_H
#include <string.h>
#endif // HAVE_STRING_H
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif // HAVE_STDLIB_H
#if defined(HAVE_SYS_TIME_H) && defined(HAVE_GETTIMEOFDAY)
#include <sys/time.h>
#endif // HAVE_SYS_TIME_H && HAVE_GETTIMEOFDAY
#include <time.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif // HAVE_UNISTD_H

#include <new>

// The verbosity level (0..3) lives in the two lowest bits of the flag word,
// which is why the level can be raised by simply incrementing the word.
#define VERBOSE_LEVEL1		1
#define VERBOSE_LEVEL2		2
#define VERBOSE_LEVEL3		3
#define VERBOSE_MAX			VERBOSE_LEVEL3
#define VERBOSE_MASK		3

// Every remaining option owns a single bit above the verbosity field.
#define ADD_HARDLINKED		(1 << 2)
#define FULL_LIST			(1 << 3)
#define HUMAN_READABLE		(1 << 4)
#define MATCH_LEFT			(1 << 5)
#define MATCH_RIGHT			(1 << 6)
#define MATCH_TIME			(1 << 7)
#define REPORT_HARDLINKS	(1 << 8)
#define SKIP_SORT			(1 << 9)

// Combinations of the match bits.
#define MATCH_AUTO			(MATCH_LEFT | MATCH_RIGHT)
#define MATCH_MASK			(MATCH_AUTO | MATCH_TIME)

// Selectors: each one yields the requested bits of flag word m
// (non-zero when set).
#define S_ADD_HARDLINKED(m)		((m) & ADD_HARDLINKED)
#define S_FULL_LIST(m)			((m) & FULL_LIST)
#define S_HUMAN_READABLE(m)		((m) & HUMAN_READABLE)
#define S_MATCH(m)				((m) & MATCH_AUTO)
#define S_MATCH_LEFT(m)			((m) & MATCH_LEFT)
#define S_MATCH_RIGHT(m)		((m) & MATCH_RIGHT)
#define S_MATCH_TIME(m)			((m) & MATCH_TIME)
#define S_MATCH_MASK(m)			((m) & MATCH_MASK)
#define S_REPORT_HARDLINKS(m)	((m) & REPORT_HARDLINKS)
#define S_SKIP_SORT(m)			((m) & SKIP_SORT)

// Verbosity tests: true when the level stored in m is at least level N.
#define S_VERBOSE(m)			((m) & VERBOSE_MASK)
#define S_VERBOSE_LEVEL1(m)		(S_VERBOSE(m) >= VERBOSE_LEVEL1)
#define S_VERBOSE_LEVEL2(m)		(S_VERBOSE(m) >= VERBOSE_LEVEL2)
#define S_VERBOSE_LEVEL3(m)		(S_VERBOSE(m) >= VERBOSE_LEVEL3)

// Settings filled in by processOptions
// Name used in messages: argv[0] without its leading directories.
static const char *program = NULL;
// Separator handed to the result printers (set by -s / -S).
static const char *sep = "\t";
// readInput skips files whose size is <= minSize.
static off_t minSize = 0;
// readInput skips files whose size is > maxSize, unless maxSize is 0.
// NOTE(review): with this default, files larger than UINT_MAX bytes are
// skipped unless -m 0 is given - confirm that this is intended.
static off_t maxSize = UINT_MAX;
// Option bits; by default verbosity level 1 and MATCH_LEFT.
unsigned flags = VERBOSE_LEVEL1 | MATCH_LEFT;
// Terminator of each input path ('\n', or 0 after -0).
static int eol = '\n';

// Timestamps taken by processInput: at the start (time0), after reading
// the input (time1, DEBUG builds only), after sorting (time2), after
// reporting hard links (time3) and after comparing the files (time4).
static struct timeval time0;
#ifdef DEBUG
static struct timeval time1;
#endif
static struct timeval time2;
static struct timeval time3;
static struct timeval time4;

// Used by deleteEarly: the moment from which the next info line may be
// printed (0 until deleteEarly ran for the first time).
static time_t deleteEarly_t = 0;

// Sum of the values returned by Holder::compareFiles; reported by
// processStats as the wasted disk space.
static ulongest_t waisted;

// Parses text as an integer (decimal, 0-prefixed octal or 0x-prefixed
// hexadecimal, as accepted by the %i conversion) and stores it in value.
// Returns 1 on success; returns 0 and leaves value untouched on failure.
static int parseSizeOption(const char *text, off_t &value)
{
	// off_t has no conversion specifier of its own, so scan into a type
	// that matches the conversion exactly and convert afterwards.
	long long parsed;
	if (sscanf(text, "%lli", &parsed) != 1)
		return 0;
	value = (off_t)parsed;
	return 1;
}

/**
 * Parses the command line options and stores the outcome in the file-level
 * settings (program, sep, minSize, maxSize, flags and eol).
 *
 * @param argc  number of entries in argv
 * @param argv  the command line as received by main
 * @param usage called with the program name for -h, -? and any option that
 *              is not recognised
 * @return      optind, the index of the first argument that getopt did not
 *              consume
 *
 * The option -V prints the copyright notice and terminates the process.
 */
int processOptions(int argc, char **argv, void (&usage)(const char *)) throw()
{
	// The program name is argv[0] without its leading directories.
	const char *slash = strrchr(argv[0], '/');
	program = slash ? slash + 1 : argv[0];

	int c;
	while((c = getopt(argc, argv, "h?g:m:s:S:aH0xALZtilrqVv")) != -1)
		switch(c)
		{
			default: case 'h': case '?': usage(program);				break;

			case 'g':
				if (!parseSizeOption(optarg, minSize))
					minSize = 0, fprintf(stderr,
						"warning: can't convert -g %s, using -g 0 instead\n",
						optarg);
				break;
			case 'm':
				// A maximum of 0 disables the upper size limit.
				if (!parseSizeOption(optarg, maxSize))
					maxSize = 0, fprintf(stderr,
						"warning: can't convert -m %s, using -m 0 instead\n",
						optarg);
				break;

			case 's':
				fprintf(stderr, "this option is obsolite use -S instead\n");
				// fall through: -s still behaves like -S
			case 'S': sep = optarg; 									break;
			case 'H': flags |= HUMAN_READABLE; 							break;
			case '0': eol = 0; 											break;

			case 'a': flags &= ~MATCH_MASK;		 						break;
			case 'A': flags &= ~MATCH_RIGHT;	flags |= MATCH_LEFT;	break;
			case 'L': flags &= ~MATCH_MASK; 	flags |= MATCH_AUTO; 	break;
			case 'Z': flags &= ~MATCH_LEFT;		flags |= MATCH_RIGHT;  	break;
			// The time based sort orders are compiled out of processInput
			// when LOW_MEMORY_PROFILE is defined, so -t must go with them.
			// The misspelled name this guard used before is still honoured.
#if !defined(LOW_MEMORY_PROFILE) && !defined(LOW_MEMEMORY_PROFILE)
			case 't': 							flags |= MATCH_TIME; 	break;
#endif // !LOW_MEMORY_PROFILE && !LOW_MEMEMORY_PROFILE
			case 'x': 						flags |= FULL_LIST; 		break;
			case 'i':						flags |= ADD_HARDLINKED;	break;
			case 'r':						flags |= REPORT_HARDLINKS;	break;
			case 'l': flags &= ~REPORT_HARDLINKS;						break;

			case 'q': flags &= ~VERBOSE_MASK; 							break;
			case 'V':
				printf(COPYRIGHT, PACKAGE_STRING, program);
				exit(EXIT_SUCCESS);
			break;
			// The verbosity level occupies the lowest bits of flags, so
			// incrementing flags raises the level by one.
			case 'v': if ((S_VERBOSE(flags)) < VERBOSE_MAX) ++flags; 	break;
		}
	return optind;
}

/**
 * Called after an allocation failed (std::bad_alloc) to make room so that
 * the caller can retry.
 *
 * @param holder     the container holding the paths read so far
 * @param write2Disk (WITH_DISK_STORAGE only) visitor used to move paths
 *                   from the holder to disk
 * @param flags      selects the permitted strategies: bit 0 allows writing
 *                   paths to disk, bit 1 allows removing small files from
 *                   the holder. Note that this parameter hides the global
 *                   option word, which is reached through ::flags.
 *
 * When small files are removed the global minSize is raised. If minSize
 * ends up at or above the largest file size the process is terminated with
 * EXIT_FAILURE.
 */
#ifdef WITH_DISK_STORAGE
static void
solveMemoryProblem(Holder &holder, Write2Disk &write2Disk, size_t flags)
#else // WITH_DISK_STORAGE
static void solveMemoryProblem(Holder &holder, size_t flags)
#endif // WITH_DISK_STORAGE
{
	Stats stats;
	holder.accept(stats);
	size_t max = stats.getMaxFileSize();

#ifdef WITH_DISK_STORAGE
	size_t maxDiskFileSize = 0;
	size_t min = stats.getMinFileSize();
	size_t avg = stats.getFilenames() ? stats.getTotalSize() / stats.getFilenames() : 0;

	// would give problems with sorting.
	switch(S_MATCH_MASK(::flags))
	{
		case MATCH_LEFT:								// -A
		case MATCH_RIGHT:								// -Z
			write2Disk.reset(0);
			flags &= ~1;
	}

	// try to write to disk
	if (flags & 1)
	{
		// Raise the size limit for files that go to disk in three steps
		// (between min and avg, avg, max) until something was written.
		do
		{
			if (maxDiskFileSize < avg / 2 + min / 2)
				maxDiskFileSize = avg / 2 + min / 2;
			else if (maxDiskFileSize < avg)
				maxDiskFileSize = avg;
			else
				maxDiskFileSize = max;
			holder.accept(write2Disk.reset(maxDiskFileSize));
		}
		while(maxDiskFileSize < max && write2Disk.done() <= 0);
		if (write2Disk.done())
		{
			fprintf(stderr, "memory full: written %u paths to disk\n",
				write2Disk.done());
			return;
		}
	}

#endif // WITH_DISK_STORAGE
	// try to remove from memory if we fail
	if (flags & 2)
	{
		unsigned long counter = 0;
		do
		{
			// Start at 32 and double minSize on every round ...
			if (minSize == 0)
				minSize = 32;
			else if (minSize < max)
				minSize <<= 1;
			// ... and once doubling reaches max, step back and approach
			// max with ever smaller increments instead.
			if (minSize >= max)
			{
				minSize >>= 1;
				size_t tmp = minSize >> 1;
				while(minSize + tmp >= max && tmp > 1)
					tmp >>= 1;
				minSize += tmp;
			}
			counter += holder.remove(0, minSize);
		}
		while(counter == 0 && minSize < max);
	}

	// abort if we fail
	if (minSize >= max)
	{
		fprintf(stderr,
			"memory full: aborting... to manny files with the same size.\n");
		exit(EXIT_FAILURE);
	}
	// The verbosity level is stored in the global option word, not in the
	// strategy bits received as parameter.
	else if (S_VERBOSE_LEVEL1(::flags))
		fprintf(stderr,
			"memory full: changed minimum file size to %zu\n",
			(size_t)minSize);	// minSize is an off_t, %zu needs a size_t
}

/**
 * Reads paths from stdin, one per eol-terminated record, and registers
 * every path that passes the filters in holder.
 *
 * A path is ignored when lstat fails on it, when it is not a regular file,
 * when its size is not larger than minSize, or when maxSize is non-zero and
 * its size exceeds maxSize. Whenever std::bad_alloc is thrown,
 * solveMemoryProblem is called and the failed step is repeated.
 *
 * @param holder     receives the accepted paths
 * @param write2Disk (WITH_DISK_STORAGE only) passed on to
 *                   solveMemoryProblem
 */
#ifdef WITH_DISK_STORAGE
static void readInput(Holder &holder, Write2Disk &write2Disk) throw()
#else // WITH_DISK_STORAGE
static void readInput(Holder &holder) throw()
#endif // WITH_DISK_STORAGE
{
	struct stat info;
	size_t capacity = PATH_INIT;
	// Marked "intentional" by the author; presumably this refers to the
	// allocation being made outside of any try block - TODO confirm.
	char *path = new char[capacity]; // intentional
	char *pos = NULL;

	for(;;)
	{
		// Step 1: fetch the next path, retrying after memory was freed.
		pos = NULL;
		for(bool retry = true; retry; )
		{
			try
			{
				pos = fgetline(path, capacity, stdin, eol, pos);
				retry = false;
			}
			catch(std::bad_alloc &)
			{
				pos = path + ((pos == NULL) ? capacity : strlen(path));
#ifdef WITH_DISK_STORAGE
				solveMemoryProblem(holder, write2Disk, 3);
#else // WITH_DISK_STORAGE
				solveMemoryProblem(holder, 3);
#endif // WITH_DISK_STORAGE
			}
		}

		// Step 2: filter the path and store it, retrying after memory was
		// freed. The filter is evaluated again on every attempt because
		// solveMemoryProblem may have raised minSize.
		for(bool retry = true; retry; )
		{
			try
			{
				const bool wanted = lstat(path, &info) >= 0 &&
					S_ISREG(info.st_mode) != 0 &&
					info.st_size > minSize &&
					!(maxSize && info.st_size > maxSize);
				if (wanted)
				{
					FileGroup &group = holder[info][info];
					if (group != path &&
						(S_ADD_HARDLINKED(flags) || group.isEmpty()))
						group += path;
				}
				retry = false;
			}
			catch(std::bad_alloc &)
			{
#ifdef WITH_DISK_STORAGE
				solveMemoryProblem(holder, write2Disk, 3);
#else // WITH_DISK_STORAGE
				solveMemoryProblem(holder, 3);
#endif // WITH_DISK_STORAGE
			}
		}

		// fgetline returned NULL: this was the last round.
		if (!pos)
			break;
	}
	delete[] path;
}

/**
 * Callback handed to Holder::compareFiles that decides whether path may be
 * added to the file group obj.
 *
 * @return 0 when obj != path does not hold; otherwise non-zero when
 *         hard linked names are requested (ADD_HARDLINKED) or when
 *         obj.getSize() is 0
 */
static int addingAllowed(const char *path, const FileGroup &obj)
{
	if (!(obj != path))
		return 0;
	if (S_ADD_HARDLINKED(flags))
		return 1;
	return obj.getSize() == 0;
}

/**
 * Callback handed to Holder::compareFiles that reports progress on stderr
 * when the verbosity level is 3: a dot per call and, at most once every
 * 60 seconds, an info line with the file size of obj and the progress.
 *
 * @param obj the size group that is being processed
 * @param i   zero-based position used for the progress report
 * @param n   total used for the progress report
 * @return    always 1 (NOTE(review): the meaning of the value is defined
 *            by Holder::compareFiles, which is not visible here)
 */
static int deleteEarly(SizeGroup &obj, size_t i, size_t n)
{
	// Progress is only reported at the highest verbosity level.
	if (!S_VERBOSE_LEVEL3(flags))
		return 1;

	// Not yet time for an info line: print a dot
	if (time(NULL) < deleteEarly_t)
	{
		fprintf(stderr, ".");
		return 1;
	}

	// Skip the first time
	if (deleteEarly_t == 0)
	{
		deleteEarly_t = 60 + time(NULL);
		return 1;
	}

	// Print a info line
	fprintf(stderr, "\ninfo: FileSize ");
	fprintsize(stderr, obj.getFileSize());
	// All three values are size_t; guard against a division by zero.
	fprintf(stderr, " | %zu%% (%zu/%zu)\n",
		n ? 100 * (i + 1) / n : (size_t)0, (i + 1), n);

	// Return here in 60s
	deleteEarly_t = 60 + time(NULL);
	return 1;
}

/**
 * Lets holder compare the collected files and adds the value it returns to
 * the file-level counter waisted, which is reset first. When
 * std::bad_alloc is thrown, solveMemoryProblem is called and the
 * comparison is started again.
 *
 * @param stats            passed on to Holder::compareFiles
 * @param holder           the container with the files to compare
 * @param printFileCompare callback passed on to Holder::compareFiles
 * @param flags            passed on to Holder::compareFiles. Note that
 *                         this parameter hides the global option word,
 *                         which is reached through ::flags.
 * @param preCheck         callback passed on to Holder::compareFiles
 * @param write2Disk       (WITH_DISK_STORAGE only) passed on to
 *                         solveMemoryProblem
 */
void compareFiles(Stats &stats, Holder &holder,
	int (&printFileCompare)(const SizeGroup &,
		const FileGroup &, const Filename &,
		const FileGroup &, const Filename &,
		int result),
	int flags,
	int (*preCheck)(const SizeGroup &,
		const FileGroup &, const FileGroup &)
#ifdef WITH_DISK_STORAGE
	, Write2Disk &write2Disk
#endif // WITH_DISK_STORAGE
	) throw()
{
	int continueRoutine = 1;
	waisted = 0;
	do
	{
		try
		{
			waisted += holder.compareFiles(stats, printFileCompare,
				flags, addingAllowed,deleteEarly, preCheck);
			continueRoutine = 0;
		}
		catch(std::bad_alloc &e)
		{
#ifdef WITH_DISK_STORAGE
			solveMemoryProblem(holder, write2Disk, 3);
#else // WITH_DISK_STORAGE
			solveMemoryProblem(holder, 3);
#endif // WITH_DISK_STORAGE
		}
	} while(continueRoutine);
	// Terminate the line of progress dots. deleteEarly prints those based
	// on the global option word, so the same word must be tested here
	// rather than the flags parameter.
	if (S_VERBOSE_LEVEL3(::flags))
		fprintf(stderr, ".\n");
}

/**
 * Runs the four stages of the program: reading the paths from stdin,
 * sorting them as selected by the match options, optionally reporting the
 * hard linked file names and finally comparing the files. The moments at
 * which the stages finish are stored in time0..time4 for processStats.
 *
 * @param stats            passed on to compareFiles
 * @param printFileCompare callback passed on to compareFiles
 * @param printHard        callback given to PrintHardLinked for stage 3
 * @param selectResults    called with the option word and the separator;
 *                         its result is passed to compareFiles as flags
 * @param preCheck         callback passed on to compareFiles
 *
 * When the minimum file size had to be raised along the way, a hint on how
 * to process the skipped files is printed unless the program is quiet.
 */
void processInput(Stats &stats,
	int (&printFileCompare)(const SizeGroup &, const FileGroup &,
		const Filename &, const FileGroup &, const Filename &,
		int result),
	int (&printHard)(const char *a, const char *b, nlink_t nlink,
		off_t fileSize, const char *sep),
	int (&selectResults)(int flags, const char *sep),
	int (*preCheck)(const SizeGroup &,
		const FileGroup &, const FileGroup &)) throw()
{
#ifdef HAVE_GETTIMEOFDAY
	gettimeofday(&time0, (struct timezone *)NULL);
#endif // HAVE_GETTIMEOFDAY
	// Remember the requested minimum; solveMemoryProblem may raise it.
	size_t oldMinSize = minSize;
	Holder holder;

	// Reserve memory for later (better memory management)
	char *dummy;
	{
		size_t dummySize = sizeof(char **);
#ifndef LOW_MEMORY_PROFILE
		dummySize += sizeof(int) + sizeof(size_t) + sizeof(void *);
#endif // LOW_MEMORY_PROFILE
		dummySize *= EXPECTED_MAX_GROUP;
		dummySize += (EXPECTED_MAX_GROUP + 1) * EXPECTED_MAX_GROUP / 2;
		dummy = new char[dummySize]; // 1 MB
	}

	// Stage1 - reading the input
	if (S_VERBOSE_LEVEL2(flags))
		fprintf(stderr, "info: reading input\n");
#ifdef WITH_DISK_STORAGE
	Storage storage(program);
	Write2Disk write2Disk(storage);
	readInput(holder, write2Disk);
#else // WITH_DISK_STORAGE
	readInput(holder);
#endif // WITH_DISK_STORAGE

#ifdef DEBUG
#ifdef HAVE_GETTIMEOFDAY
	gettimeofday(&time1, (struct timezone *)NULL);
#endif // HAVE_GETTIMEOFDAY
#endif

	// Stage2 - mandatory sorting
#ifdef DEBUG
	if (S_VERBOSE_LEVEL2(flags))
		fprintf(stderr, "info: sorting\n");
#endif
	// Any combination of match options that is not listed here leaves
	// the holder unsorted.
	switch(S_MATCH_MASK(flags))
	{
		case MATCH_LEFT:								// -A
			holder.sort(FileGroup::compareFirst, Filename::compareFirst);
			break;
		case MATCH_RIGHT:								// -Z
			holder.sort(FileGroup::compareLast, Filename::compareLast);
			break;
#ifndef LOW_MEMORY_PROFILE
		case MATCH_LEFT | MATCH_TIME:					// -At
			holder.sort(FileGroup::compareOldest, Filename::compareFirst);
			break;
		case MATCH_RIGHT | MATCH_TIME:					// -Zt
			holder.sort(FileGroup::compareYoungest, Filename::compareLast);
			break;
#endif // LOW_MEMORY_PROFILE
		case MATCH_LEFT | MATCH_RIGHT:					// -L
			holder.sort(FileGroup::compareNlink, Filename::compareFirst);
	}
#ifdef HAVE_GETTIMEOFDAY
	gettimeofday(&time2, (struct timezone *)NULL);
#endif // HAVE_GETTIMEOFDAY

	// Stage3 - print hard linked files
	if (S_REPORT_HARDLINKS(flags))
	{
		if (S_VERBOSE_LEVEL2(flags))
			fprintf(stderr, "info: hard linked filenames\n");
		PrintHardLinked printHardLinked(printHard, sep);
		holder.accept(printHardLinked);
#ifdef HAVE_GETTIMEOFDAY
		gettimeofday(&time3, (struct timezone *)NULL);
#endif // HAVE_GETTIMEOFDAY
	}
	else
		time3 = time2;	// stage skipped: it took no time

	// Release dummy for the memory needs in stage4
	delete[] dummy;

	// Stage4 - checkfiles & print identical
	if (S_VERBOSE_LEVEL2(flags))
		fprintf(stderr, "info: comparing files\n");
	compareFiles(stats, holder, printFileCompare,
		selectResults(flags, sep),
#ifdef WITH_DISK_STORAGE
		preCheck, write2Disk);
#else // WITH_DISK_STORAGE
		preCheck);
#endif // WITH_DISK_STORAGE
#ifdef HAVE_GETTIMEOFDAY
	gettimeofday(&time4, (struct timezone *)NULL);
#endif // HAVE_GETTIMEOFDAY

	if (minSize != oldMinSize && S_VERBOSE_LEVEL1(flags))
	{
		// minSize is an off_t while %zu requires a size_t argument.
		const size_t newMinSize = (size_t)minSize;
		fprintf(stderr,
			"Changed mimimum file size from %zu to %zu.\n",
			oldMinSize, newMinSize);
		// The files that were dropped are those between both sizes.
		fprintf(stderr, "Continue with: ... | %s -g %zu -m %zu\n",
			program, oldMinSize, newMinSize);
	}
}

// Writes the spaces needed to right-align a number of `used` digits in a
// column that is `width` digits wide to stderr.
static void printStatsPadding(int used, int width)
{
	for(int i = used; i < width; ++i)
		fprintf(stderr, " ");
}

/**
 * Prints the statistics of the run to stderr: the number of i-nodes and
 * file names, the average and total file size, the wasted disk space with
 * its percentage of the total and, when gettimeofday is available, the
 * time spent in each stage as recorded by processInput.
 *
 * Unless human readable output was requested, the numbers are right-aligned
 * to the width of the grand total.
 *
 * @param stats the statistics that were collected during the run
 */
void processStats(Stats &stats) throw()
{
	size_t avg = stats.getFilenames() ? stats.getTotalSize() / stats.getFilenames() : 0;
	int percentage = stats.getTotalSize() ? 100 * waisted / stats.getTotalSize() : 0;
	// The grand total is the widest number and sets the column width.
	int ndigits = digits(stats.getTotalSize());

	fprintf(stderr, "info: stats");
	fprintf(stderr, "\nNumber of i-nodes...........: ");
	if (S_HUMAN_READABLE(flags))	fprintsize(stderr, stats.getFiles());
	else
	{
		// Pad according to the number that is printed on this line.
		printStatsPadding(digits(stats.getFiles()), ndigits);
		fprintf(stderr, "%lu", stats.getFiles());
	}

	fprintf(stderr, "\nNumber of filenames.........: ");
	if (S_HUMAN_READABLE(flags))	fprintsize(stderr, stats.getFilenames());
	else
	{
		printStatsPadding(digits(stats.getFilenames()), ndigits);
		fprintf(stderr, "%lu", stats.getFilenames());
	}


	fprintf(stderr, "\nAverage file size...........: ");
	if (S_HUMAN_READABLE(flags))	fprintsize(stderr, avg);
	else
	{
		printStatsPadding(digits(avg), ndigits);
		fprintf(stderr, "%zu", avg);
	}

	fprintf(stderr, "\nGrand total file size.......: ");
	if (S_HUMAN_READABLE(flags))	fprintsize(stderr, stats.getTotalSize());
#ifdef __LONG_LONG_SUPPORTED
	else							fprintf(stderr, "%llu", stats.getTotalSize());
#else // __LONG_LONG_SUPPORTED
	else							fprintf(stderr, "%lu", stats.getTotalSize());
#endif // __LONG_LONG_SUPPORTED

	fprintf(stderr, "\nWaisted disk space..........: ");
	if (S_HUMAN_READABLE(flags))	fprintsize(stderr, waisted);
	else
	{
		printStatsPadding(digits(waisted), ndigits);
#ifdef __LONG_LONG_SUPPORTED
		fprintf(stderr, "%llu", waisted);
#else // __LONG_LONG_SUPPORTED
		fprintf(stderr, "%lu", waisted);
#endif // __LONG_LONG_SUPPORTED
	}
	fprintf(stderr, " (%d%%)", percentage);

	// A stage is only listed when its start and end time differ.
	fprintf(stderr, "\nExecution time:");
#ifdef HAVE_GETTIMEOFDAY
#ifdef DEBUG
	if (time1.tv_sec !=  time0.tv_sec || time1.tv_usec !=  time0.tv_usec)
	{
		fprintf(stderr, "\n    reading input...........:");
		fprinttime(stderr, time1, time0, S_HUMAN_READABLE(flags));
	}
	if (time2.tv_sec !=  time1.tv_sec || time2.tv_usec !=  time1.tv_usec)
	{
		fprintf(stderr, "\n    sorting.................:");
		fprinttime(stderr, time2, time1, S_HUMAN_READABLE(flags));
	}
#else
	// Without DEBUG reading and sorting are timed as one stage.
	if (time2.tv_sec !=  time0.tv_sec || time2.tv_usec !=  time0.tv_usec)
	{
		fprintf(stderr, "\n    reading input...........:");
		fprinttime(stderr, time2, time0, S_HUMAN_READABLE(flags));
	}
#endif
	if (time3.tv_sec !=  time2.tv_sec || time3.tv_usec !=  time2.tv_usec)
	{
		fprintf(stderr, "\n    report hard linked files:");
		fprinttime(stderr, time3, time2, S_HUMAN_READABLE(flags));
	}
	if (time4.tv_sec !=  time3.tv_sec || time4.tv_usec !=  time3.tv_usec)
	{
		fprintf(stderr, "\n    report identical files..:");
		fprinttime(stderr, time4, time3, S_HUMAN_READABLE(flags));
	}
	if (time4.tv_sec !=  time0.tv_sec || time4.tv_usec !=  time0.tv_usec)
	{
		fprintf(stderr, "\n                              --------+");
		fprintf(stderr, "\n        Total execution time:");
		fprinttime(stderr, time4, time0, S_HUMAN_READABLE(flags));
		fprintf(stderr, "\n");
	}
	else
		fprintf(stderr, "0s\n");
#endif // HAVE_GETTIMEOFDAY
}

