<?php

namespace ExternalImporter\application\components\scrap;

defined('\ABSPATH') || exit;

use ExternalImporter\application\helpers\TextHelper;
use ExternalImporter\application\admin\ParserConfig;

use function ExternalImporter\prn;
use function ExternalImporter\prnx;

/**
 * Base class for scraping-service providers: hook registration, URL routing-rule matching and last-used tracking.
 *
 * @author keywordrush.com <support@keywordrush.com>
 * @link https://www.keywordrush.com
 * @copyright Copyright &copy; 2025 keywordrush.com
 */
abstract class Scrap
{
    /**
     * Provider slug. Read through late static binding in getSlug(), so the
     * value declared by the concrete subclass is the one that is used.
     */
    const SLUG = '';

    /** @var string|null Slug recorded by the most recent markScrapingServiceUsed() call. */
    protected static $last_used_provider = null;

    /** @var array|null Routing rule recorded by the most recent markScrapingServiceUsed() call. */
    protected static $last_used_rule = null;

    /**
     * Callback registered on the "ei_create_from_url" hook by initAction().
     *
     * @param string $url
     * @param mixed  $args Second hook argument (the hook is registered with 2 accepted arguments).
     */
    abstract public function doAction($url, $args);

    /**
     * Return the slug of the concrete provider class.
     *
     * @return string
     */
    public function getSlug()
    {
        return static::SLUG;
    }

    /**
     * Attach doAction() to the "ei_create_from_url" hook.
     *
     * Nothing is registered when the provider has no token configured.
     *
     * @return void
     */
    public function initAction()
    {
        if (!$this->getToken())
        {
            return;
        }

        \add_action('ei_create_from_url', array($this, 'doAction'), 10, 2);
    }

    /**
     * Read this provider's token from the parser configuration.
     *
     * The option name is "<slug>_token".
     *
     * @return mixed Whatever ParserConfig::option() returns for that option.
     */
    public function getToken()
    {
        $option_name = $this->getSlug() . '_token';
        return ParserConfig::getInstance()->option($option_name);
    }

    /**
     * Remember that this provider handled a URL and announce it.
     *
     * Stores the slug and rule in the class-level "last used" slots and fires
     * the "ei_scraping_service_used" action with (slug, url, rule).
     *
     * @param string $url
     * @param array  $rule
     * @return void
     */
    protected function markScrapingServiceUsed($url, array $rule)
    {
        self::$last_used_provider = $this->getSlug();
        self::$last_used_rule     = $rule;

        \do_action('ei_scraping_service_used', self::$last_used_provider, $url, $rule);
    }

    /**
     * @return string|null Slug stored by the last markScrapingServiceUsed() call, or null.
     */
    public static function getLastUsedProvider()
    {
        return self::$last_used_provider;
    }

    /**
     * @return array|null Rule stored by the last markScrapingServiceUsed() call, or null.
     */
    public static function getLastUsedRule()
    {
        return self::$last_used_rule;
    }

    /**
     * Clear the "last used" provider and rule.
     *
     * @return void
     */
    public static function resetLastUsedScrapingService()
    {
        self::$last_used_provider = null;
        self::$last_used_rule     = null;
    }

    /**
     * Return the first matching routing rule for the given URL, or null.
     * Global rule order is respected (top to bottom).
     *
     * Rules are read from the "routing_rules" option and kept in a
     * function-static variable, so the option is read at most once.
     * Entries that are not arrays, or whose "pattern" is empty or not a
     * string, are skipped instead of being passed to the matcher.
     *
     * @param string $url
     * @return array|null
     */
    protected function getRoutingRuleForUrl($url)
    {
        static $routing_rules = null;

        // Cache routing rules for this request.
        if ($routing_rules === null)
        {
            $rules = ParserConfig::getInstance()->option('routing_rules');
            $routing_rules = is_array($rules) ? $rules : array();
        }

        if (empty($routing_rules))
        {
            return null;
        }

        // Parse host + path once per call.
        $target = $this->extractHostAndPath($url);
        if ($target === null)
        {
            return null;
        }

        // Rules are evaluated from top to bottom; first match wins globally.
        foreach ($routing_rules as $rule)
        {
            // Stored configuration may be malformed: ignore anything that is
            // not an array carrying a non-empty string pattern.
            if (!is_array($rule) || empty($rule['pattern']) || !is_string($rule['pattern']))
            {
                continue;
            }

            if ($this->matchesRoutingPatternComponents($target['host'], $target['path'], $rule['pattern']))
            {
                // Return the whole rule, including provider & params.
                return $rule;
            }
        }

        return null;
    }

    /**
     * Check whether this provider should handle the URL.
     *
     * True only when the first matching routing rule names this provider's
     * slug in its "provider" field.
     *
     * @param string $url
     * @return bool
     */
    public function needSendThrough($url)
    {
        $rule = $this->getRoutingRuleForUrl($url);
        if (!$rule)
        {
            return false;
        }

        $current_provider = $this->getSlug();
        $provider         = isset($rule['provider']) ? $rule['provider'] : '';

        return ($provider === $current_provider);
    }

    /**
     * Check whether given host + path match a routing pattern.
     *
     * Pattern examples:
     *  - example.com
     *  - *.example.com
     *  - example.com/path/*
     *
     * A leading URL scheme ("https://example.com/path/*") is ignored, so a
     * pattern pasted as a full URL behaves like its scheme-less form.
     * Non-string patterns never match.
     *
     * @param string $host
     * @param string $path
     * @param string $pattern
     * @return bool
     */
    protected function matchesRoutingPatternComponents($host, $path, $pattern)
    {
        if (!is_string($pattern))
        {
            return false;
        }

        // Drop an optional "scheme://" prefix; without this the text before
        // the first "/" would be "https:" and could never equal a host name.
        $pattern = (string) preg_replace('~^[a-z][a-z0-9+.\-]*://~i', '', trim($pattern));
        if ($pattern === '')
        {
            return false;
        }

        // Split pattern into host part and optional path part:
        // example.com/path/*  => hostPattern="example.com", pathPattern="path/*"
        $parts       = explode('/', $pattern, 2);
        $hostPattern = strtolower($parts[0]);
        $pathPattern = isset($parts[1]) ? $parts[1] : '';

        if (!$this->matchesHostPattern($host, $hostPattern))
        {
            return false;
        }

        if ($pathPattern === '')
        {
            // Host-only rule
            return true;
        }

        return $this->matchesPathPattern($path, $pathPattern);
    }

    /**
     * Match host against host pattern (case-insensitive).
     *
     * Supported:
     *  - example.com       (exact; a leading "www." is optional on both sides)
     *  - *.example.com     (any subdomain of example.com, but not example.com itself)
     *  - *                 (any host)
     *
     * @param string $host
     * @param string $hostPattern
     * @return bool
     */
    protected function matchesHostPattern($host, $hostPattern)
    {
        $host        = strtolower($host);
        $hostPattern = strtolower(trim($hostPattern));

        if ($hostPattern === '')
        {
            return false;
        }

        // Full wildcard (not exposed, but safe)
        if ($hostPattern === '*')
        {
            return true;
        }

        // Subdomain wildcard: *.example.com
        if (strpos($hostPattern, '*.') === 0)
        {
            $suffix = (string) substr($hostPattern, 2); // "example.com"

            if ($suffix === '')
            {
                // A bare "*." names no domain; without this guard it would
                // match every host that happens to end with a dot.
                return false;
            }

            if ($host === $suffix)
            {
                // plain example.com is NOT a subdomain of *.example.com
                return false;
            }

            $tail = '.' . $suffix;

            return (substr($host, -strlen($tail)) === $tail);
        }

        // Exact match (treat leading "www." as optional)
        $host_no_www        = preg_replace('~^www\.~i', '', $host);
        $hostPattern_no_www = preg_replace('~^www\.~i', '', $hostPattern);

        return ($host_no_www === $hostPattern_no_www);
    }

    /**
     * Match URL path against path pattern (part after host).
     *
     * Patterns:
     *  - path/*    => prefix match (/path/)
     *  - path      => exact match (/path)
     *  - * or /*   => any path
     *  - (empty)   => any path
     *
     * The comparison is case-sensitive.
     *
     * @param string $path        e.g. "/path/to/page"
     * @param string $pathPattern e.g. "path/*" or "path"
     * @return bool
     */
    protected function matchesPathPattern($path, $pathPattern)
    {
        $pathPattern = trim($pathPattern);

        // Empty => no path constraint; a single "*" => any path
        if ($pathPattern === '' || $pathPattern === '*')
        {
            return true;
        }

        // Normalize to leading slash: "path/*" => "/path/*"
        $pathPattern = '/' . ltrim($pathPattern, '/');

        // Special-case "/*" => any path
        if ($pathPattern === '/*')
        {
            return true;
        }

        // Trailing "*" means "prefix match"
        if (substr($pathPattern, -1) === '*')
        {
            $prefix = substr($pathPattern, 0, -1); // remove '*', keep leading slash

            return (strncmp($path, $prefix, strlen($prefix)) === 0);
        }

        // Otherwise: exact path match
        return ($path === $pathPattern);
    }

    /**
     * Check if URL matches a routing pattern.
     *
     * Examples:
     *  - example.com
     *  - *.example.com
     *  - example.com/path/*
     *
     * Extracts host and path from the URL and delegates the comparison to
     * matchesRoutingPatternComponents().
     *
     * @param string $url
     * @param string $pattern
     * @return bool
     */
    protected function matchesRoutingPattern($url, $pattern)
    {
        // Reject unusable patterns before doing any URL parsing.
        if (!is_string($pattern) || trim($pattern) === '')
        {
            return false;
        }

        $target = $this->extractHostAndPath($url);
        if ($target === null)
        {
            return false;
        }

        return $this->matchesRoutingPatternComponents($target['host'], $target['path'], $pattern);
    }

    /**
     * Extract additional query parameters from a routing rule.
     *
     * The rule's "params" entry is parsed as a query string
     * (e.g. "premium=true&country_code=us").
     *
     * @param array $rule
     * @param array $protected_keys Keys that must NOT be overridden (e.g. api_key, url).
     *
     * @return array Extra query args to merge into provider request URL.
     */
    protected function getExtraParamsFromRule($rule, $protected_keys = array())
    {
        if (!is_array($rule) || empty($rule['params']) || !is_string($rule['params']))
        {
            return array();
        }

        $params_string = trim($rule['params']);
        if ($params_string === '')
        {
            return array();
        }

        $extra = array();
        // Parse query-like string: premium=true&country_code=us
        parse_str($params_string, $extra);

        if (!is_array($extra) || empty($extra))
        {
            return array();
        }

        if (!empty($protected_keys) && is_array($protected_keys))
        {
            foreach ($protected_keys as $key)
            {
                // Only strings and integers are valid array offsets; anything
                // else cannot name a parsed parameter and is ignored.
                if (is_string($key) || is_int($key))
                {
                    unset($extra[$key]);
                }
            }
        }

        return $extra;
    }

    /**
     * Split a URL into the lowercase host and the path used for rule matching.
     *
     * @param string $url
     * @return array|null array('host' => string, 'path' => string), or null
     *                    when TextHelper::getHostName() yields no host.
     */
    private function extractHostAndPath($url)
    {
        $host = TextHelper::getHostName($url);
        if (empty($host))
        {
            return null;
        }

        $path = \parse_url($url, PHP_URL_PATH);
        if ($path === null || $path === false || $path === '')
        {
            // No usable path component: treat it as the site root.
            $path = '/';
        }

        return array(
            'host' => strtolower($host),
            'path' => $path,
        );
    }
}
