package net.psammead.mwapi.scrapper;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.psammead.mwapi.config.ConfigInfo;
import net.psammead.mwapi.connection.TitleUtil;
import net.psammead.mwapi.net.IllegalFormException;
import net.psammead.mwapi.net.JerichoUtil;
import net.psammead.mwapi.ui.UnsupportedURLException;
import net.psammead.util.IOUtil;
import net.psammead.util.Logger;
import net.psammead.util.XMLCodec;
import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.Source;

/**
 * Scrapes configuration data from a wiki site: the charset and special
 * namespace of the main page, the siteinfo block of Special:Export, the
 * localized names of special pages and the system messages.
 *
 * All URL parts (<code>protocol</code>, <code>host</code>, <code>prettyPath</code>,
 * <code>rawPath</code>) are concatenated verbatim, so the caller has to supply
 * them including any separators.
 */
public class Scrapper {
	private static final Logger log	= new Logger(Scrapper.class);
	
	//-------------------------------------------------------------------------
	//## PHP parsing
	
	// A single-quoted PHP string, group 1 is its raw (still escaped) content.
	// The repetition is possessive on purpose: java.util.regex evaluates a
	// greedy repetition of a group containing an alternation recursively and
	// dies with a StackOverflowError on long values. The possessive form
	// accepts exactly the same strings, because neither alternative can
	// consume an unescaped quote, so there is never anything to backtrack into.
	private static final String	Q_VALUE		= "'((?:[^'\\\\]++|\\\\.)*+)'";
	
	// one "'key' => 'value'," entry: either bare, prefixed with a hash
	// or wrapped into a PHP block comment
	private static final String	BASE		= Q_VALUE + " => " + Q_VALUE + ",";
	private static final String	PLAIN		= BASE + "\n";
	private static final String	HASHED		= "#" + BASE + "\n";
	private static final String	SLASHED		= "/\\* " + BASE + " \\*/\n";
	// groups 1+2 belong to SLASHED, 3+4 to HASHED and 5+6 to PLAIN
	private static final String	COMBINED	= SLASHED + "|" + HASHED + "|" + PLAIN;
	
	private static final Pattern	ENTRY_PATTERN	= Pattern.compile(COMBINED, Pattern.DOTALL);
	
	// used with find(): true when the page contains a PHP message array at all
	private static final Pattern	ARRAY_PROBE		= Pattern.compile(
			"\n\\$(wgAllMessages|messages).*? = array\\(\n(.*,\n)\\);", Pattern.DOTALL);
	
	// used with matches(): group 1 is the body of the PHP message array.
	// since 03aug06 on wikimedia sites it's messages instead of wgAllMessages
	private static final Pattern	ARRAY_PATTERN	= Pattern.compile(
			".*?<!-- start content -->.*?\n\\$(?:wgAllMessages|messages).*? = array\\(\n(.*,\n)\\);.*?<!-- end content -->.*?", Pattern.DOTALL);
	
	private final HttpUtil	http;
	
	/** creates a Scrapper downloading through the system proxy settings */
	public Scrapper() throws MalformedURLException {
		http	= new HttpUtilCommons();
		http.useSystemProxy();
	}
	
	//=========================================================================
	//## basic info
	
	/**
	 * downloads the main page and returns an Object containing its charset
	 * and the special namespace.
	 * The namespace is taken from the action attribute of the search form:
	 * it is the text between the last slash and the colon following it.
	 */
	public BasicInfo fetchBasicInfo(String protocol, String host, String prettyPath) throws IOException, IllegalFormException {
		// download main page
		URL			url		= new URL(protocol + host + prettyPath);
		HttpResult	content	= http.download(url);
		
		Source	source			= JerichoUtil.createSource(content.body, log);
		Element	form			= JerichoUtil.fetchForm(source, "searchform", "searchform", -1);
		String	searchAction	= JerichoUtil.fetchAttributeValue(form.getStartTag(), "action");
		String	specialNs		= searchAction.replaceAll(".*/(.*):.*", "$1");
		
		return new BasicInfo(content.charset, specialNs);
	}
	
	//=========================================================================
	//## site info
	
	/**
	 * fetches the localized name of all namespaces using the siteinfo in
	 * Special:Export, plus the localized names of the special pages listed
	 * in ConfigInfo.SPECIAL_PAGES.
	 *
	 * @throws IOException when the download fails or the export contains
	 *         no siteinfo or no namespace with the key -1
	 */
	public SiteInfo fetchSiteInfo(String protocol, String host, String prettyPath) throws IOException {
		// uses a (hopefully) not existing title because we only want the siteinfo
		URL			url			= new URL(protocol + host + prettyPath + "Special:Export?action=submit&pages=23kl5jskdjfhskdfhslkfjsdkqweuh23&curonly=checked");
		HttpResult	content		= http.download(url);
		Source		source		= JerichoUtil.createSource(content.body, log);
		Element		siteinfo	= JerichoUtil.firstElement(source, "siteinfo");
		// NOTE(review): assumes firstElement signals "not found" with null, confirm
		if (siteinfo == null)	throw new IOException("siteinfo not found: " + url);

		// TODO: should generate NameSpace objects!
		Map<Integer,String>	nameSpaces	= new HashMap<Integer,String>();
		@SuppressWarnings("unchecked")
		List<Element>		elements	= siteinfo.findAllElements("namespace");
		for (Element element : elements) {
			Attribute	key		= element.getAttributes().get("key");
			if (key == null)	throw new RuntimeException("namespace.key not found");
			int		index	= Integer.parseInt(key.getValue());
			String	name	= element.getContent().toString();	// toString() was getSourceText()
			nameSpaces.put(Integer.valueOf(index), name);
		}
		
		String sitename		= JerichoUtil.firstElementText(siteinfo, "sitename");
		String base			= JerichoUtil.firstElementText(siteinfo, "base");
		String generator	= JerichoUtil.firstElementText(siteinfo, "generator");
		String titleCase	= JerichoUtil.firstElementText(siteinfo, "case");
		
		// without the special namespace every special page lookup below
		// would request a title starting with the text "null:"
		String	specialNS	= nameSpaces.get(Integer.valueOf(-1));
		if (specialNS == null)	throw new IOException("special namespace (key -1) not found: " + url);
		Map<String,String>	specialPages	= fetchSpecialPages(protocol, host, prettyPath, content.charset, specialNS);
		
		return new SiteInfo(sitename, base, generator, titleCase, specialPages, nameSpaces);
	}
	
	//=========================================================================
	//## special pages

	/**
	 * resolves every entry of ConfigInfo.SPECIAL_PAGES with fetchSpecialPage.
	 * @return a Map from the canonical to the localized special page name
	 */
	public Map<String,String> fetchSpecialPages(String protocol, String host, 
			String prettyPath, String charset, String specialNS) throws IOException {
		Map<String,String> out = new HashMap<String,String>();
		for (String canonical : ConfigInfo.SPECIAL_PAGES) {
			out.put(canonical, fetchSpecialPage(protocol, host, prettyPath, charset, specialNS, canonical));
		}
		return out;
	}
	
	/**
	 * finds the localized name of a single special page by requesting it
	 * under its canonical name and looking at the redirect target.
	 * attention, specialNs already is URL-encoded
	 *
	 * @return the part of the redirect target's title behind the last colon,
	 *         or the canonical name itself when the site does not redirect
	 * @throws IOException when the request fails or a title cannot be
	 *         encoded or decoded
	 */
	public String fetchSpecialPage(String protocol, String host, String prettyPath, 
			String charset, String specialNS, String canonical) throws IOException {
		try {
			URL	url	= new URL(protocol + host + prettyPath + TitleUtil.encodeTitle(specialNS + ":" + canonical, charset));
			String	location	= http.redirectsTo(url);
			if (location == null)	return canonical;
			
			// only the first redirect is followed. with both specialNS and
			// the special page name canonical, two redirects would be needed
			
			// the title is everything behind the last slash of the target
			String	raw		= location.replaceAll(".*/", "");
			String	title	= TitleUtil.spaces(TitleUtil.decodeTitle(raw, charset));
			// strip the namespace
			return title.replaceAll(".*:", "");
		}
		catch (UnsupportedURLException e) {
			IOException ee = new IOException("cannot decode specialPage title: " + canonical);
			ee.initCause(e);
			throw ee;
		}
	}
	
	//=========================================================================
	//## messages (PHP)
	
	/** turns the escaped quotes of a single-quoted PHP string back into plain quotes, nothing else is unescaped */
	private static String q_decode(String s)	{ return s.replace("\\'", "'");	}
	
	/**
	 * writes an unparseable page into scrapped.html in the temp directory
	 * for debugging. This is best-effort: a failing dump is only logged, so
	 * it can never replace the error that is actually being reported.
	 */
	private static void dumpScrapped(String body) {
		File	dump	= new File(System.getProperty("java.io.tmpdir"), "scrapped.html");
		try {
			IOUtil.writeStringToFile(dump, body, "UTF-8");
		}
		catch (Exception e) {
			// deliberately broad, see above
			log.info("could not write debug dump " + dump + ": " + e);
		}
	}
	
	/** dumps the page and fails with the given message unless the condition holds */
	private static void requireContent(boolean found, String body, String message) {
		if (found)	return;
		dumpScrapped(body);
		throw new RuntimeException(message);
	}
	
	/**
	 * downloads the PHP variant of the allmessages page and returns a Map
	 * from message keys to message texts.
	 *
	 * @throws RuntimeException when the page does not look like expected
	 */
	private Map<String,String> fetchMessagesPHP(String protocol, String host, String rawPath, String uselang) throws IOException {
		// download allmessages page
		// TODO: would be faster using the specialNS
		URL			url		= new URL(protocol + host + rawPath + "?title=Special:Allmessages&ot=php&useskin=monobook&uselang=" + uselang);
		HttpResult	content	= http.download(url);
		String		body	= content.body;
		
		// check the parts separately first to get a specific error
		requireContent(body.contains("<!-- start content -->"),	body, "### start content not found");
		requireContent(body.contains("<!-- end content -->"),	body, "### end content not found");
		requireContent(ARRAY_PROBE.matcher(body).find(),		body, "### content not found");
		
		// find messages php code
		Matcher	array	= ARRAY_PATTERN.matcher(body);
		if (!array.matches()) {
			throw new RuntimeException("### no content matches found: " + url);
		}
		String	decoded	= XMLCodec.decode(array.group(1), true, false);
		
		Map<String,String>	out		= new HashMap<String,String>();
		Matcher				entries	= ENTRY_PATTERN.matcher(decoded);
		while (entries.find()) {
			// the key group of the alternative that matched is the non-null one
			int	keyGroup;
			if      (entries.group(1) != null)	keyGroup	= 1;	// slashed
			else if (entries.group(3) != null)	keyGroup	= 3;	// hashed
			else if (entries.group(5) != null)	keyGroup	= 5;	// plain
			else	continue;
			
			String	key		= entries.group(keyGroup);
			String	value	= q_decode(entries.group(keyGroup + 1));
			out.put(key, value);
		}
		
		return out;
	}
	
	//=========================================================================
	//## messages (XML)
	
	/**
	 * downloads the XML variant of the allmessages page and returns a Map
	 * from message keys to message texts. The Map is empty when the page
	 * contains no message elements.
	 */
	private Map<String,String> fetchMessagesXML(String protocol, String host, String rawPath, String uselang) throws IOException {
		// download allmessages page
		// TODO: would be faster using the specialNS
		URL			url		= new URL(protocol + host + rawPath + "?title=Special:Allmessages&ot=xml&uselang=" + uselang);
		HttpResult	content	= http.download(url);
		
		Map<String,String>	out	= new HashMap<String,String>();
		Source source = JerichoUtil.createSource(content.body, log);
		@SuppressWarnings("unchecked")
		List<Element> elements = source.findAllElements("message");
		for (Element element : elements) {
			String	key		= element.getAttributeValue("name");
			// a message without a name cannot be looked up later
			if (key == null)	continue;
			// TODO getTextExtractor deletes linefeeds, and without it the XML
			// is not decoded, that is why decodedTextOnly is used here
			String	value	= JerichoUtil.decodedTextOnly(source, element.getContent());
			out.put(key, value);
		}
		return out;
	}
	
	/**
	 * fetches the system messages for the language <code>uselang</code>,
	 * preferring the XML variant of the allmessages page and falling back
	 * to the PHP variant when the XML variant yields no messages.
	 *
	 * @return a non-empty Map from message keys to message texts
	 * @throws IOException when a download fails or neither variant yields
	 *         any message
	 */
	public Map<String,String> fetchMessages(String protocol, String host, String rawPath, String uselang) throws IOException {
		Map<String, String> messagesXML = fetchMessagesXML(protocol, host, rawPath, uselang);
		if (!messagesXML.isEmpty())	return messagesXML;
		log.info(host + ": could not get XML messages, trying PHP");
		Map<String, String> 	messagesPHP	= fetchMessagesPHP(protocol, host, rawPath, uselang);
		if (!messagesPHP.isEmpty())	return messagesPHP;
		throw new IOException("could not fetch messages");
	}
}
