/********************************************************************************
Class WSHTMLTokenizer
********************************************************************************/
/**
 * Constructor for the tokenizer.
 * It takes no arguments and initialises no instance state; the methods
 * defined below are attached through WSHTMLTokenizer.prototype.
 * @constructor
 */
function WSHTMLTokenizer() {
}
//---------------------------------------------------------------------------------
/**
 * Tells whether a character is one of the four whitespace characters the
 * tokenizer recognises: space, tab, carriage return or line feed.
 * @param {string} ch Character to classify.
 * @return {boolean} True for ' ', '\t', '\r' or '\n'; false otherwise.
 */
WSHTMLTokenizer.prototype.isWhiteSpace = function(ch) {
	if (ch == ' ' || ch == '\t') {
		return true;
	}
	return (ch == '\r' || ch == '\n');
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a character is an unaccented Latin letter (a-z or A-Z).
 * Letters outside those two ranges are reported as non-alphabetic.
 * @param {string} ch Character to classify.
 * @return {boolean} True when ch lies in 'a'..'z' or 'A'..'Z'.
 */
WSHTMLTokenizer.prototype.isAlpha = function(ch) {
	var bLower = (ch >= 'a' && ch <= 'z');
	var bUpper = (ch >= 'A' && ch <= 'Z');
	return (bLower || bUpper);
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a character is a decimal digit ('0' through '9').
 * @param {string} ch Character to classify.
 * @return {boolean} True when ch lies in '0'..'9'.
 */
WSHTMLTokenizer.prototype.isDigit = function(ch) {
	var bAtLeastZero = (ch >= '0');
	return (bAtLeastZero && ch <= '9');
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a character is a letter or a digit, as decided by
 * isAlpha() and isDigit().
 * @param {string} ch Character to classify.
 * @return {boolean} Result of isAlpha(ch) when it is truthy, otherwise the
 *     result of isDigit(ch).
 */
WSHTMLTokenizer.prototype.isAlnum = function(ch) {
	var bLetter = this.isAlpha(ch);
	if (bLetter) {
		return bLetter;
	}
	return this.isDigit(ch);
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a character is one of the punctuation marks the tokenizer
 * treats specially: comma, period, colon, semicolon, exclamation mark or
 * question mark.
 * @param {string} ch Character to classify.
 * @return {boolean} True for ',', '.', ':', ';', '!' or '?'.
 */
WSHTMLTokenizer.prototype.isPunct = function(ch) {
	var bPause = (ch == ',' || ch == ';' || ch == ':');
	var bTerminal = (ch == '.' || ch == '!' || ch == '?');
	return (bPause || bTerminal);
};
//---------------------------------------------------------------------------------
/**
 * Tells whether the character at nPos sits between two alphanumeric
 * characters, i.e. both of its direct neighbours exist inside sText and
 * satisfy isAlnum().
 * @param {string} sText Text being examined.
 * @param {number} nPos Index of the character whose neighbours are checked.
 * @return {boolean} True when both neighbours exist and are alphanumeric.
 */
WSHTMLTokenizer.prototype.isMiddleChar = function(sText, nPos) {
	var nBefore = nPos - 1;
	var nAfter = nPos + 1;
	// A character on either edge of the text has a missing neighbour.
	if (!(nAfter < sText.length) || !(nBefore >= 0)) {
		return false;
	}
	return (this.isAlnum(sText.charAt(nAfter)) && this.isAlnum(sText.charAt(nBefore)));
};
//---------------------------------------------------------------------------------
/**
 * Tells whether the character at nPos may be part of a word: letters and
 * digits (per isAlnum), the underscore, and an apostrophe that is
 * surrounded by alphanumeric characters (per isMiddleChar).
 * @param {string} sText Text being examined.
 * @param {number} nPos Index of the character to classify.
 * @return {boolean} True when the character can belong to a word.
 */
WSHTMLTokenizer.prototype.isLegalChar = function(sText, nPos) {
	var chHere = sText.charAt(nPos);
	if (chHere == '\'') {
		// An apostrophe only counts inside a word, e.g. "don't".
		return this.isMiddleChar(sText, nPos);
	}
	if (chHere == '_') {
		return true;
	}
	return this.isAlnum(chHere);
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a word looks like an internet address.
 * A word qualifies when it
 *   - starts with "www.", "http://" or "https://" and has at least one
 *     more character after that prefix, or
 *   - contains an '@' that is not its first character, followed somewhere
 *     later by a '.', which is the shape of an e-mail address.
 * @param {string} sWord Word to examine.
 * @return {boolean} True when the word matches one of the shapes above.
 */
WSHTMLTokenizer.prototype.isInternetAddress = function(sWord) {
	if (sWord.length == 0) {
		return false;
	}
	var vPrefixes = ["www.", "http://", "https://"];
	for (var i = 0; i < vPrefixes.length; ++i) {
		if (sWord.indexOf(vPrefixes[i]) == 0) {
			// A bare prefix with nothing after it is not an address.
			return (sWord.length > vPrefixes[i].length);
		}
	}
	var nAtPos = sWord.indexOf('@');
	if (nAtPos > 0) {
		return (sWord.indexOf('.', nAtPos) >= 0);
	}
	return false;
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a word has the shape of a dotted abbreviation such as
 * "e.g." or "a.b".
 * Every character except the last must alternate letter, '.', letter,
 * '.', ... starting with a letter. The last character is only rejected
 * when a '.' was expected at that position and a letter was found instead;
 * anything else (for example trailing punctuation) is accepted there.
 * Words shorter than two characters are never abbreviations.
 * @param {string} sWord Word to examine.
 * @return {boolean} True when the word matches the pattern above.
 */
WSHTMLTokenizer.prototype.isAbbreviation = function(sWord) {
	var nLast = sWord.length - 1;
	if (nLast < 1) {
		return false;
	}
	for (var k = 0; k < nLast; ++k) {
		var chCur = sWord.charAt(k);
		// Odd positions must hold a dot, even positions a letter.
		var bExpectDot = (k % 2 == 1);
		if (bExpectDot ? (chCur != '.') : !this.isAlpha(chCur)) {
			return false;
		}
	}
	var bDotExpectedLast = (nLast % 2 == 1);
	return (!bDotExpectedLast || !this.isAlpha(sWord.charAt(nLast)));
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a word looks like a number or a simple numeric expression.
 * The word is scanned character by character with these rules:
 *   - '-', '+', '*', '/' and '=' are always accepted;
 *   - '.' is always accepted and forbids a ',' directly after it;
 *   - ',' is accepted only directly after a digit-led run (a digit has been
 *     seen since the last '.' or ',');
 *   - 'x' / 'X' must be the first character or directly follow '0';
 *   - 'e' / 'E' must be the last character or be followed by '+' or '-';
 *   - every other character must be a decimal digit.
 * @param {string} sWord Word to examine.
 * @return {boolean} True when every character satisfies the rules above;
 *     false for the empty string.
 */
WSHTMLTokenizer.prototype.isNumber = function(sWord) {
	var nLength = sWord.length;
	if (nLength == 0) {
		return false;
	}
	var bCommaAllowed = false;
	for (var i = 0; i < nLength; ++i) {
		var ch = sWord.charAt(i);
		switch (ch) {
		case '-':
		case '+':
		case '*':
		case '/':
		case '=':
			break;
		case '.':
			bCommaAllowed = false;
			break;
		case ',':
			if (!bCommaAllowed) {
				return false;
			}
			bCommaAllowed = false;
			break;
		case 'x':
		case 'X':
			if (i > 0 && sWord.charAt(i - 1) != '0') {
				return false;
			}
			break;
		case 'E':
		case 'e':
			// The bound must be the word's own length; an unqualified
			// `length` is not a variable of this function.
			if (i <= nLength - 2) {
				var chNext = sWord.charAt(i + 1);
				if (chNext != '+' && chNext != '-') {
					return false;
				}
			}
			break;
		default:
			if (!this.isDigit(ch)) {
				return false;
			}
			bCommaAllowed = true;
		}
	}
	return true;
};
//---------------------------------------------------------------------------------
/**
 * Tells whether a word should be kept in one piece: it is an abbreviation,
 * a number or an internet address. The three checks run in that order and
 * stop at the first one that succeeds.
 * @param {string} sWord Word to examine.
 * @return {boolean} True when any of the three checks accepts the word.
 */
WSHTMLTokenizer.prototype.isWholeWord = function(sWord) {
	var bWhole = this.isAbbreviation(sWord);
	if (!bWhole) {
		bWhole = this.isNumber(sWord);
	}
	if (!bWhole) {
		bWhole = this.isInternetAddress(sWord);
	}
	return bWhole;
};
//---------------------------------------------------------------------------------
/**
 * Appends a token record to vTokens and reports where the next token
 * would start. Empty tokens are not recorded.
 * @param {Array} vTokens Token list to append to; each record has the
 *     fields sWord, nPos and nLength.
 * @param {string} sToken Token text.
 * @param {number} nPos Position assigned to the token.
 * @return {number} nPos plus the token length, passed through parseInt.
 */
WSHTMLTokenizer.prototype.addToken = function(vTokens, sToken, nPos) {
	var nTokenLength = sToken.length;
	if (nTokenLength > 0) {
		vTokens.push({
			sWord: sToken,
			nPos: nPos,
			nLength: nTokenLength
		});
	}
	return parseInt(nPos + nTokenLength);
};
//---------------------------------------------------------------------------------
/**
 * Splits one whitespace-free chunk of text into tokens and appends them to
 * vTokens through addToken().
 *
 * The chunk is consumed from left to right:
 *   - If the remaining text is a whole word (see isWholeWord), it is
 *     emitted as a single token; when it ends in a punctuation character
 *     (see isPunct) that character is emitted as a token of its own.
 *   - Otherwise the leading run of legal word characters (see isLegalChar)
 *     is emitted as one token, the first illegal character as the next
 *     token, and processing continues with whatever follows it.
 *
 * The work is done in a loop rather than by recursing on the remainder, so
 * the call depth does not grow with the number of illegal characters in
 * the chunk.
 *
 * @param {Array} vTokens Token list to append to.
 * @param {string} sToken Chunk of text to split.
 * @param {number} nPos Position assigned to the first character of sToken.
 */
WSHTMLTokenizer.prototype.extractWords = function(vTokens, sToken, nPos) {
	var sRest = sToken;
	while (sRest.length > 0) {
		if (this.isWholeWord(sRest)) {
			var chLast = sRest.charAt(sRest.length - 1);
			if (this.isPunct(chLast)) {
				// Keep trailing punctuation out of the word itself.
				nPos = this.addToken(vTokens, sRest.substr(0, sRest.length - 1), nPos);
				nPos = this.addToken(vTokens, chLast, nPos);
			} else {
				nPos = this.addToken(vTokens, sRest, nPos);
			}
			return;
		}
		// Find the first character that cannot belong to a word.
		var nBreak = 0;
		while (nBreak < sRest.length && this.isLegalChar(sRest, nBreak)) {
			++nBreak;
		}
		if (nBreak >= sRest.length) {
			this.addToken(vTokens, sRest, nPos);
			return;
		}
		nPos = this.addToken(vTokens, sRest.substr(0, nBreak), nPos);
		nPos = this.addToken(vTokens, sRest.charAt(nBreak), nPos);
		// The remainder is re-examined from scratch, including the
		// whole-word check, exactly as a fresh chunk would be.
		sRest = sRest.substr(nBreak + 1);
	}
};
//---------------------------------------------------------------------------------
/**
 * Advances past any run of whitespace (as defined by isWhiteSpace) that
 * starts at nPos.
 * @param {string} sText Text being scanned.
 * @param {number} nPos Index to start from.
 * @return {number} Index of the first non-whitespace character at or after
 *     nPos, or nPos unchanged when it does not point at whitespace.
 */
WSHTMLTokenizer.prototype.skipWhitespaces = function(sText, nPos) {
	var nCursor = nPos;
	for (; nCursor < sText.length; ++nCursor) {
		if (!this.isWhiteSpace(sText.charAt(nCursor))) {
			break;
		}
	}
	return nCursor;
};
//---------------------------------------------------------------------------------
/**
 * Reads the next whitespace-delimited chunk of sText starting at nPos.
 *
 * A line feed or a double quote is always a chunk of its own: when met
 * before any other character has been collected it is returned as the
 * chunk, otherwise it ends the current chunk and is left in place for the
 * next call. Other whitespace is skipped while nothing has been collected
 * and ends the chunk afterwards.
 *
 * @param {string} sText Text being scanned.
 * @param {number} nPos Index to start reading from.
 * @return {{sWord: string, nPos: number}} The chunk (empty when nothing was
 *     left to read) and the index from which the next call should resume,
 *     already advanced past any whitespace.
 */
WSHTMLTokenizer.prototype.getToken = function(sText, nPos) {
	var sCollected = "";
	var nScan = nPos;
	var nResume = -1;
	var bStopped = false;
	while (!bStopped && nScan < sText.length) {
		var chCur = sText.charAt(nScan);
		if (chCur == '\n' || chCur == '\"') {
			if (sCollected.length == 0) {
				// The delimiter itself is the chunk; resume after it.
				sCollected += chCur;
				nResume = nScan + 1;
			} else {
				// Leave the delimiter for the next call.
				nResume = nScan;
			}
			bStopped = true;
		} else if (!this.isWhiteSpace(chCur)) {
			sCollected += chCur;
			++nScan;
		} else if (sCollected.length > 0) {
			nResume = nScan + 1;
			bStopped = true;
		} else {
			++nScan;
		}
	}
	if (!bStopped) {
		// Ran off the end of the text without hitting a delimiter.
		nResume = nScan + 1;
	}
	return {
		sWord: sCollected,
		nPos: this.skipWhitespaces(sText, nResume)
	};
};
//---------------------------------------------------------------------------------
/**
 * Tokenizes plain text: reads it chunk by chunk with getToken() and hands
 * every chunk to extractWords(), which appends the resulting tokens to
 * vTokens.
 * @param {Array} vTokens Token list to append to.
 * @param {string} sText Text to tokenize.
 * @param {number} nOffset Value added to every position inside sText when
 *     computing token positions.
 */
WSHTMLTokenizer.prototype.tokenizeText = function(vTokens, sText, nOffset) {
	var nCursor = this.skipWhitespaces(sText, 0);
	var oChunk = this.getToken(sText, nCursor);
	// An empty chunk signals that the text is exhausted.
	while (oChunk.sWord.length > 0) {
		this.extractWords(vTokens, oChunk.sWord, nCursor + nOffset);
		nCursor = oChunk.nPos;
		oChunk = this.getToken(sText, nCursor);
	}
};
//---------------------------------------------------------------------------------
/**
 * Tokenizes plain text and returns the tokens in a fresh array. Positions
 * are relative to the start of sText.
 * @param {string} sText Text to tokenize.
 * @return {Array} The tokens produced by tokenizeText().
 */
WSHTMLTokenizer.prototype.parseText = function(sText) {
	var vResult = [];
	this.tokenizeText(vResult, sText, 0);
	return vResult;
};
//---------------------------------------------------------------------------------
/**
 * Finds where text begins at or after nCurrentPos by skipping whitespace
 * and any markup that starts there.
 *
 * Each tag is skipped up to and including its closing '>'. For "<script"
 * the skip extends to the '>' of the following "</script", and for "<!--"
 * to the '>' of the following "-->", so their contents are skipped too.
 *
 * @param {string} strContent Markup to scan; the "<script" comparison is
 *     case-sensitive.
 * @param {number} nCurrentPos Index to start from.
 * @return {number} Index of the first character that is neither whitespace
 *     nor part of skipped markup, or -1 when a tag, script block or
 *     comment is never closed.
 */
WSHTMLTokenizer.prototype.getStartTextPosition = function(strContent, nCurrentPos) {
	var nIndex = this.skipWhitespaces(strContent, nCurrentPos);
	while (nIndex >= 0 && nIndex < strContent.length && strContent.charAt(nIndex) == '<') {
		// Script blocks and comments are skipped together with their body.
		var sTerminator = null;
		if (strContent.substr(nIndex, 7) == "<script") {
			sTerminator = "</script";
		} else if (strContent.substr(nIndex, 4) == "<!--") {
			sTerminator = "-->";
		}
		if (sTerminator !== null) {
			nIndex = strContent.indexOf(sTerminator, nIndex);
		}
		if (nIndex >= 0) {
			nIndex = strContent.indexOf(">", nIndex);
			if (nIndex >= 0) {
				++nIndex;
			}
		}
		nIndex = this.skipWhitespaces(strContent, nIndex);
	}
	return nIndex;
};
//---------------------------------------------------------------------------------
/**
 * Finds the next '<' at or after nCurrentPos.
 * @param {string} strContent Markup to scan.
 * @param {number} nCurrentPos Index to start searching from.
 * @return {number} Index of the next '<', or -1 when there is none.
 */
WSHTMLTokenizer.prototype.getStartTagPosition = function(strContent, nCurrentPos) {
	return strContent.indexOf('<', nCurrentPos);
};
//---------------------------------------------------------------------------------
/**
 * Finds where the text of the document body begins.
 * When strContent contains "<body", the result is whatever
 * getStartTextPosition() reports from that tag onwards; otherwise the
 * whole content is treated as body and 0 is returned.
 * @param {string} strContent Markup to scan; the "<body" search is
 *     case-sensitive.
 * @return {number} Start position of the body text.
 */
WSHTMLTokenizer.prototype.getBodyPosition = function(strContent) {
	var nBodyTag = strContent.indexOf("<body");
	if (nBodyTag < 0) {
		return 0;
	}
	return this.getStartTextPosition(strContent, nBodyTag);
};
//---------------------------------------------------------------------------------
/**
 * Finds where the next token starts at or after nCurrentPos, skipping
 * spaces, ampersands and escape sequences of the form "&name;".
 *
 * After skipping a run of '&' and ' ' characters, if the run ended with an
 * '&', findEndEscTokenIndex() is asked for the terminating ';'. When one is
 * found inside the content the search restarts right after it; otherwise
 * the position just after the run is returned.
 *
 * @param {string} strContent Text being scanned.
 * @param {number} nCurrentPos Index to start from.
 * @return {number} Index where the next token starts; may equal
 *     strContent.length when nothing is left.
 */
WSHTMLTokenizer.prototype.findStartTokenIndex = function(strContent, nCurrentPos) {
	var nLen = strContent.length;
	var nStart = nCurrentPos;
	while (nStart < nLen) {
		var chCur = strContent.charAt(nStart);
		if (chCur != '&' && chCur != ' ') {
			break;
		}
		++nStart;
	}
	// Nothing skipped, or nothing left: the answer is already known.
	if (nStart == nCurrentPos || nStart >= nLen) {
		return nStart;
	}
	if (strContent.charAt(nStart - 1) != '&') {
		return nStart;
	}
	var nSemicolon = this.findEndEscTokenIndex(strContent, nStart);
	if (nSemicolon >= 0 && nSemicolon < nLen) {
		return this.findStartTokenIndex(strContent, nSemicolon + 1);
	}
	return nStart;
};
//---------------------------------------------------------------------------------
/**
 * Looks for the ';' that ends an escape sequence, examining the character
 * at nCurrentPos and at most the five characters after it.
 * @param {string} strContent Text being scanned.
 * @param {number} nCurrentPos Index of the first character after the '&'.
 * @return {number} Index of the ';' when one is found in that window; -1
 *     when the window ends on a character other than ';'; or
 *     strContent.length when the content ends before a ';' is seen.
 */
WSHTMLTokenizer.prototype.findEndEscTokenIndex = function(strContent, nCurrentPos) {
	var nStop = nCurrentPos + 5;
	var nFinish = nCurrentPos;
	while (nFinish < strContent.length && nFinish < nStop) {
		if (strContent.charAt(nFinish) == ';') {
			return nFinish;
		}
		++nFinish;
	}
	// The character at the stop position may still be the terminator.
	var bInRange = (nFinish >= 0 && nFinish < strContent.length);
	if (bInRange && strContent.charAt(nFinish) != ';') {
		return -1;
	}
	return nFinish;
};
//---------------------------------------------------------------------------------
/**
 * Finds the index of the last character of the token that starts at
 * nCurrentPos. A token ends at the first space, ';' or '&'. A terminating
 * ';' is part of the token; a space or '&' is not.
 * @param {string} strContent Text being scanned.
 * @param {number} nCurrentPos Index of the token's first character.
 * @return {number} Index of the token's last character, or
 *     strContent.length when no delimiter is found before the end.
 */
WSHTMLTokenizer.prototype.findEndTokenIndex = function(strContent, nCurrentPos) {
	var nLen = strContent.length;
	var nFinish = nCurrentPos;
	while (nFinish < nLen) {
		var chCur = strContent.charAt(nFinish);
		if (chCur == ' ' || chCur == ';' || chCur == '&') {
			break;
		}
		++nFinish;
	}
	if (nFinish >= 0 && nFinish < nLen && strContent.charAt(nFinish) != ';') {
		// Stopped on a space or '&': step back onto the token itself.
		return nFinish - 1;
	}
	return nFinish;
};
//---------------------------------------------------------------------------------
/**
 * Extracts the next token from a stretch of tag-free HTML text, starting
 * at nCurrentPos. Token boundaries come from findStartTokenIndex() and
 * findEndTokenIndex(); an end index that falls outside the content is
 * clamped to the last character.
 * @param {string} strContent Text being scanned.
 * @param {number} nCurrentPos Index to start from.
 * @return {{sWord: string, nStart: number, nFinish: number}} The token
 *     text with the indices of its first and last character. sWord is
 *     empty when no token is left; nStart and nFinish keep whatever value
 *     they last received, or undefined when they were never assigned.
 */
WSHTMLTokenizer.prototype.getHTMLToken = function(strContent, nCurrentPos) {
	var nStart, nFinish;
	var sToken = "";
	var nLen = strContent.length;
	while (nCurrentPos < nLen) {
		nStart = this.findStartTokenIndex(strContent, nCurrentPos);
		if (nStart < 0 || nStart >= nLen) {
			break;
		}
		nFinish = this.findEndTokenIndex(strContent, nStart);
		if (nFinish < 0 || nFinish >= nLen) {
			nFinish = nLen - 1;
		}
		sToken = strContent.substr(nStart, nFinish - nStart + 1);
		if (sToken.length > 0) {
			break;
		}
		// Empty slice: retry from the character after the reported end.
		nCurrentPos = nFinish + 1;
	}
	return {
		sWord: sToken,
		nStart: nStart,
		nFinish: nFinish
	};
};
//---------------------------------------------------------------------------------
/**
 * Tokenizes a stretch of tag-free HTML text: pulls tokens out with
 * getHTMLToken() and passes each one to tokenizeText(), which appends the
 * resulting word tokens to vTokens.
 * @param {Array} vTokens Token list to append to.
 * @param {string} strSentence Text between two tags.
 * @param {number} nOffset Value added to every position inside strSentence
 *     when computing token positions.
 */
WSHTMLTokenizer.prototype.tokenizeHTML = function(vTokens, strSentence, nOffset) {
	var nCursor = 0;
	var oPiece = this.getHTMLToken(strSentence, nCursor);
	// An empty token signals that the sentence is exhausted.
	while (oPiece.sWord.length > 0) {
		this.tokenizeText(vTokens, oPiece.sWord, nOffset + oPiece.nStart);
		nCursor = oPiece.nFinish + 1;
		oPiece = this.getHTMLToken(strSentence, nCursor);
	}
};
//---------------------------------------------------------------------------------
/**
 * Tokenizes the text content of an HTML document.
 *
 * Starting at the body (see getBodyPosition), the markup is walked one
 * stretch of text at a time: getStartTextPosition() skips tags, script
 * blocks and comments, getStartTagPosition() finds where the text ends,
 * and the text in between is handed to tokenizeHTML(). Token text keeps
 * the original letter case and token positions index into
 * strOriginalContent.
 *
 * Tag matching is made case-insensitive by searching a copy in which only
 * the ASCII letters A-Z are lowercased. That copy always has the same
 * length as the original, so positions found in it are valid in the
 * original; String.prototype.toLowerCase() gives no such guarantee because
 * some characters lowercase to more than one code unit.
 *
 * @param {string} strOriginalContent HTML markup to tokenize.
 * @return {Array} The tokens found, in document order.
 */
WSHTMLTokenizer.prototype.parseHTML = function(strOriginalContent) {
	var strContent = strOriginalContent.replace(/[A-Z]/g, function(ch) {
		return ch.toLowerCase();
	});
	var vTokens = [];
	var nStartText = this.getBodyPosition(strContent);
	var nEndText = 0;
	while (nStartText >= 0 && nStartText < strContent.length) {
		nStartText = this.getStartTextPosition(strContent, nStartText);
		if (nStartText < 0) {
			// Unterminated markup: nothing more can be read reliably.
			break;
		}
		nEndText = this.getStartTagPosition(strContent, nStartText);
		if (nEndText < 0 || nEndText >= strContent.length) {
			nEndText = strContent.length;
		}
		var strSentence = strOriginalContent.substr(nStartText, nEndText - nStartText);
		this.tokenizeHTML(vTokens, strSentence, nStartText);
		nStartText = nEndText;
	}
	return vTokens;
};
//---------------------------------------------------------------------------------
