# Copyright (C) 2024 Wildfire Games.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * The name of the author may not be used to endorse or promote products
#     derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import os
import re
import sys
from functools import lru_cache
from textwrap import dedent

from babel.messages.jslexer import tokenize, unquote_string
from lxml import etree


@lru_cache
def get_mask_pattern(mask: str) -> re.Pattern:
    """Build a regex pattern for matching file paths."""
    parts = re.split(r"([*][*]?)", mask)
    pattern = ""
    for i, part in enumerate(parts):
        if i % 2 != 0:
            pattern += "[^/]+"
            if len(part) == 2:
                pattern += "(/[^/]+)*"
        else:
            pattern += re.escape(part)
    pattern += "$"
    return re.compile(pattern)


def pathmatch(mask, path):
    """Match paths to a mask, where the mask supports * and **.

    Paths use / as the separator
    * matches a sequence of characters without /.
    ** matches a sequence of characters without / followed by a / and
    sequence of characters without /

    :return: true if path matches the mask, false otherwise
    """
    return get_mask_pattern(mask).match(path) is not None
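

# A few illustrative calls showing the mask semantics implemented above.  The
# masks and paths are made-up examples for documentation only; real masks are
# supplied by the caller:
#
#   pathmatch("l10n/*.json", "l10n/messages.json")       -> True   ("*" stays within one path segment)
#   pathmatch("l10n/*.json", "l10n/long/messages.json")  -> False
#   pathmatch("gui/**.js", "gui/common/tooltips.js")     -> True   ("**" may cross "/" separators)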


class Extractor:
    def __init__(self, directory_path, filemasks, options):
        self.directory_path = directory_path
        self.options = options

        if isinstance(filemasks, dict):
            self.include_masks = filemasks["includeMasks"]
            self.exclude_masks = filemasks["excludeMasks"]
        else:
            self.include_masks = filemasks
            self.exclude_masks = []

    def run(self):
        """Extract messages.

        :return: An iterator over ``(message, plural, context, (location, pos), comment)``
                 tuples.
        :rtype: ``iterator``
        """
        empty_string_pattern = re.compile(r"^\s*$")
        directory_absolute_path = os.path.abspath(self.directory_path)
        for root, folders, filenames in os.walk(directory_absolute_path):
            # Iterate over a copy so that pruning hidden folders doesn't skip entries.
            for subdir in list(folders):
                if subdir.startswith((".", "_")):
                    folders.remove(subdir)
            folders.sort()
            filenames.sort()
            for filename in filenames:
                filename = os.path.relpath(
                    os.path.join(root, filename), self.directory_path
                ).replace(os.sep, "/")
                for filemask in self.exclude_masks:
                    if pathmatch(filemask, filename):
                        break
                else:
                    for filemask in self.include_masks:
                        if not pathmatch(filemask, filename):
                            continue
                        filepath = os.path.join(directory_absolute_path, filename)
                        for message, plural, context, position, comments in self.extract_from_file(
                            filepath
                        ):
                            if empty_string_pattern.match(message):
                                continue
                            if " " in filename or "\t" in filename:
                                filename = "\u2068" + filename + "\u2069"
                            yield message, plural, context, (filename, position), comments

    def extract_from_file(self, filepath):
        """Extract messages from a specific file.

        :return: An iterator over ``(message, plural, context, position, comments)`` tuples.
        :rtype: ``iterator``
        """


class JavascriptExtractor(Extractor):
    """Extract messages from JavaScript source code."""

    empty_msgid_warning = (
        '%s: warning: Empty msgid. It is reserved by GNU gettext: gettext("") '
        "returns the header entry with meta information, not the empty string."
    )

    def extract_javascript_from_file(self, file_object):
        funcname = message_lineno = None
        messages = []
        last_argument = None
        translator_comments = []
        concatenate_next = False
        last_token = None
        call_stack = -1
        comment_tags = self.options.get("commentTags", [])
        keywords = self.options.get("keywords", {}).keys()

        for token in tokenize(file_object.read(), dotted=False):
            if token.type == "operator" and (
                token.value == "(" or (call_stack != -1 and token.value in ("[", "{"))
            ):
                if funcname:
                    message_lineno = token.lineno
                    call_stack += 1

            elif call_stack == -1 and token.type == "linecomment":
                value = token.value[2:].strip()
                if translator_comments and translator_comments[-1][0] == token.lineno - 1:
                    translator_comments.append((token.lineno, value))
                    continue

                for comment_tag in comment_tags:
                    if value.startswith(comment_tag):
                        translator_comments.append((token.lineno, value.strip()))
                        break

            elif token.type == "multilinecomment":
                # Only one multi-line comment may precede a translation.
                translator_comments = []
                value = token.value[2:-2].strip()
                for comment_tag in comment_tags:
                    if value.startswith(comment_tag):
                        lines = value.splitlines()
                        if lines:
                            lines[0] = lines[0].strip()
                            lines[1:] = dedent("\n".join(lines[1:])).splitlines()
                            for offset, line in enumerate(lines):
                                translator_comments.append((token.lineno + offset, line))
                        break

            elif funcname and call_stack == 0:
                if token.type == "operator" and token.value == ")":
                    if last_argument is not None:
                        messages.append(last_argument)
                    if len(messages) > 1:
                        messages = tuple(messages)
                    elif messages:
                        messages = messages[0]
                    else:
                        messages = None

                    # Comments don't apply unless they immediately precede the message.
                    if translator_comments and translator_comments[-1][0] < message_lineno - 1:
                        translator_comments = []

                    if messages is not None:
                        yield (
                            message_lineno,
                            funcname,
                            messages,
                            [comment[1] for comment in translator_comments],
                        )

                    funcname = message_lineno = last_argument = None
                    concatenate_next = False
                    translator_comments = []
                    messages = []
                    call_stack = -1

                elif token.type == "string":
                    new_value = unquote_string(token.value)
                    if concatenate_next:
                        last_argument = (last_argument or "") + new_value
                        concatenate_next = False
                    else:
                        last_argument = new_value

                elif token.type == "operator":
                    if token.value == ",":
                        if last_argument is not None:
                            messages.append(last_argument)
                            last_argument = None
                        else:
                            messages.append(None)
                        concatenate_next = False
                    elif token.value == "+":
                        concatenate_next = True

            elif call_stack > 0 and token.type == "operator" and token.value in (")", "]", "}"):
                call_stack -= 1

            elif funcname and call_stack == -1:
                funcname = None

            elif (
                call_stack == -1
                and token.type == "name"
                and token.value in keywords
                and (
                    last_token is None
                    or last_token.type != "name"
                    or last_token.value != "function"
                )
            ):
                funcname = token.value

            last_token = token
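
    # A sketch of what extract_javascript_from_file() yields, with made-up keyword
    # names and source lines (illustrative only): given the options
    # {"keywords": {"translate": None, "translatePlural": (1, 2)}}, the source
    # `translate("Hello")` yields (lineno, "translate", "Hello", []) and
    # `translatePlural("%(n)s ship", "%(n)s ships", n)` yields
    # (lineno, "translatePlural", ("%(n)s ship", "%(n)s ships"), []).
    # Validation against the keyword specification happens in extract_from_file() below.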

    def extract_from_file(self, filepath):
        with open(filepath, encoding="utf-8-sig") as file_object:
            for lineno, funcname, messages, comments in self.extract_javascript_from_file(
                file_object
            ):
                spec = self.options.get("keywords", {})[funcname] or (1,) if funcname else (1,)
                if not isinstance(messages, (list, tuple)):
                    messages = [messages]
                if not messages:
                    continue

                # Validate the messages against the keyword's specification
                context = None
                msgs = []
                invalid = False
                # last_index is 1 based like the keyword spec
                last_index = len(messages)
                for index in spec:
                    if isinstance(index, (list, tuple)):
                        context = messages[index[0] - 1]
                        continue
                    if last_index < index:
                        # Not enough arguments
                        invalid = True
                        break
                    message = messages[index - 1]
                    if message is None:
                        invalid = True
                        break
                    msgs.append(message)
                if invalid:
                    continue

                # keyword spec indexes are 1 based, therefore '-1'
                if isinstance(spec[0], (tuple, list)):
                    # context-aware *gettext method
                    first_msg_index = spec[1] - 1
                else:
                    first_msg_index = spec[0] - 1
                if not messages[first_msg_index]:
                    # An empty string msgid isn't valid, emit a warning
                    where = "%s:%i" % (
                        hasattr(file_object, "name") and file_object.name or "(unknown)",
                        lineno,
                    )
                    print(self.empty_msgid_warning % where, file=sys.stderr)
                    continue

                messages = tuple(msgs)
                message = messages[0]
                plural = None
                if len(messages) == 2:
                    plural = messages[1]
                yield message, plural, context, lineno, comments


class CppExtractor(JavascriptExtractor):
    """Extract messages from C++ source code."""


class TxtExtractor(Extractor):
    """Extract messages from plain text files."""

    def extract_from_file(self, filepath):
        with open(filepath, encoding="utf-8-sig") as file_object:
            for lineno, line in enumerate([line.strip("\n\r") for line in file_object], start=1):
                if line:
                    yield line, None, None, lineno, []


class JsonExtractor(Extractor):
    """Extract messages from JSON files."""

    def __init__(self, directory_path=None, filemasks=None, options=None):
        if options is None:
            options = {}
        if filemasks is None:
            filemasks = []
        super().__init__(directory_path, filemasks, options)
        self.keywords = self.options.get("keywords", {})
        self.context = self.options.get("context", None)
        self.comments = self.options.get("comments", [])

    def set_options(self, options):
        self.options = options
        self.keywords = self.options.get("keywords", {})
        self.context = self.options.get("context", None)
        self.comments = self.options.get("comments", [])

    def extract_from_file(self, filepath):
        with open(filepath, encoding="utf-8") as file_object:
            for message, context in self.extract_from_string(file_object.read()):
                yield message, None, context, None, self.comments

    def extract_from_string(self, string):
        json_document = json.loads(string)
        yield from self.parse(json_document)

    def parse(self, data, key=None):
        """Recursively parse JSON data and extract strings."""
        if isinstance(data, list):
            for item in data:
                yield from self.parse(item)
        elif isinstance(data, dict):
            for key2, value in data.items():
                if key2 in self.keywords:
                    if isinstance(value, str):
                        yield self.extract_string(value, key2)
                    elif isinstance(value, list):
                        yield from self.extract_list(value, key2)
                    elif isinstance(value, dict):
                        if self.keywords[key2].get("extractFromInnerKeys"):
                            for value2 in value.values():
                                yield from self.parse(value2, key2)
                        else:
                            yield from self.extract_dictionary(value, key2)
                else:
                    yield from self.parse(value, key2)
        elif isinstance(data, str) and key in self.keywords:
            yield self.extract_string(data, key)

    def extract_string(self, string, keyword):
        if "tagAsContext" in self.keywords[keyword]:
            context = keyword
        elif "customContext" in self.keywords[keyword]:
            context = self.keywords[keyword]["customContext"]
        else:
            context = self.context
        return string, context

    def extract_list(self, items_list, keyword):
        for list_item in items_list:
            if isinstance(list_item, str):
                yield self.extract_string(list_item, keyword)
            elif isinstance(list_item, dict):
                extract = self.extract_dictionary(list_item[keyword], keyword)
                if extract:
                    yield extract

    def extract_dictionary(self, dictionary, keyword):
        message = dictionary.get("_string", None)
        if message and isinstance(message, str):
            if "context" in dictionary:
                context = str(dictionary["context"])
            elif "tagAsContext" in self.keywords[keyword]:
                context = keyword
            elif "customContext" in self.keywords[keyword]:
                context = self.keywords[keyword]["customContext"]
            else:
                context = self.context
            yield message, context


class XmlExtractor(Extractor):
    """Extract messages from XML files."""

    def __init__(self, directory_path, filemasks, options):
        super().__init__(directory_path, filemasks, options)
        self.keywords = self.options.get("keywords", {})
        self.json_extractor = None

    def get_json_extractor(self):
        if not self.json_extractor:
            self.json_extractor = JsonExtractor(self.directory_path)
        return self.json_extractor

    def extract_from_file(self, filepath):
        with open(filepath, encoding="utf-8-sig") as file_object:
            xml_document = etree.parse(file_object)
            for element in xml_document.iter(*self.keywords.keys()):
                keyword = element.tag
                lineno = element.sourceline
                if element.text is None:
                    continue

                comments = []
                if "extractJson" in self.keywords[keyword]:
                    json_extractor = self.get_json_extractor()
                    json_extractor.set_options(self.keywords[keyword]["extractJson"])
                    for message, context in json_extractor.extract_from_string(element.text):
                        yield message, None, context, lineno, comments
                else:
                    context = None
                    if "context" in element.attrib:
                        context = str(element.get("context"))
                    elif "tagAsContext" in self.keywords[keyword]:
                        context = keyword
                    elif "customContext" in self.keywords[keyword]:
                        context = self.keywords[keyword]["customContext"]
                    if "comment" in element.attrib:
                        comment = element.get("comment")
                        # Remove tabs, line breaks and unnecessary spaces.
                        comment = " ".join(comment.split())
                        comments.append(comment)
                    if "splitOnWhitespace" in self.keywords[keyword]:
                        for split_text in element.text.split():
                            # Splitting on whitespace is used for token lists; there, a
                            # leading '-' means the token has to be removed, so it's not
                            # to be processed here either.
                            if split_text[0] != "-":
                                yield str(split_text), None, context, lineno, comments
                    else:
                        yield str(element.text), None, context, lineno, comments
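

# --- Illustrative usage sketch ---------------------------------------------
# The block below is a hypothetical example, not part of the extraction
# pipeline: the directory, file mask, keyword names and comment tag are made
# up for demonstration and are normally supplied by the caller's
# configuration.  It only runs when this module is executed directly.
if __name__ == "__main__":
    demo_options = {
        # Keyword specs follow the convention used above: None means the msgid
        # is the first argument; a tuple gives 1-based argument indices for the
        # singular and plural forms.
        "keywords": {"translate": None, "translatePlural": (1, 2)},
        "commentTags": ["Translation:"],
    }
    extractor = JavascriptExtractor(".", ["**.js"], demo_options)
    for message, plural, context, (filename, lineno), _comments in extractor.run():
        print(f"{filename}:{lineno}: {message!r} (plural={plural!r}, context={context!r})")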