<?php
/**
 * Speaker
 * Create an audio version of your posts, with a selection of more than 340 voices across more than 52 languages and variants.
 * Exclusively on https://1.envato.market/speaker
 *
 * @encoding        UTF-8
 * @version         4.1.10
 * @copyright       (C) 2018 - 2023 Merkulove ( https://merkulov.design/ ). All rights reserved.
 * @license         Envato License https://1.envato.market/KYbje
 * @contributors    Alexander Khmelnitskiy (info@alexander.khmelnitskiy.ua), Dmitry Merkulov (dmitry@merkulov.design)
 * @support         help@merkulov.design
 **/

namespace Merkulove\Speaker;

use DOMDocument;
use DOMException;
use DOMXPath;
use Merkulove\Speaker\Unity\Settings;

/** Refuse to run when the ABSPATH constant is not defined, i.e. on direct access to this file. */
if ( false === defined( 'ABSPATH' ) ) {

	/** Send the 403 status in both the CGI and the plain HTTP form, then stop. */
	header( 'Status: 403 Forbidden' );
	header( 'HTTP/1.1 403 Forbidden' );

	exit;

}

/**
 * @package Merkulove\Speaker
 * @since 4.0.0
 */
final class Parser {

	/**
	 * Fetch the rendered frontend HTML of a post/page and, for the 'speaker' template,
	 * mute or enrich parts of it according to the Audio Content settings.
	 *
	 * When the request fails or the response body is empty, a JSON error payload
	 * is sent with wp_send_json().
	 *
	 * @param int         $post_id    ID of the Post/Page content from which we will parse.
	 * @param int         $page_index Page index for multipage posts.
	 * @param string|null $template   Template name. Settings based muting is applied only for 'speaker'.
	 *
	 * @return mixed|null HTML after the 'speaker_parse_post_content' filter.
	 * @throws DOMException
	 */
	public static function parse_post_content( int $post_id, int $page_index = 0, ?string $template = null ) {

		$options = Settings::get_instance()->options;

		/** Frontend url with post content to parse. */
		$url = self::get_frontend_url( $post_id, $template, $page_index );

		/**
		 * Get page content.
		 * NOTE(review): certificate verification is switched off for this request,
		 * presumably because the site requests its own frontend and may run on a
		 * self-signed certificate. Confirm this is intended.
		 */
		$response = wp_remote_get(
			$url,
			array(
				'sslverify' => false,
				'timeout'   => 30,
			)
		);

		/** Report a transport level error. */
		if ( is_wp_error( $response ) ) {

			$return = [
				'success' => false,
				'message' => esc_html__( 'Error connecting to', 'speaker' ) . ' ' . $url . ' ' . $response->get_error_message() . ' (' . $response->get_error_code() . ')',
			];
			wp_send_json( $return );

		}

		/** Get post content or report an error. */
		$html = wp_remote_retrieve_body( $response );
		if ( $html === '' ) {

			$response_code = wp_remote_retrieve_response_code( $response );
			$return = [
				'success' => false,
				'message' => esc_html__( 'Failed to get content due to an error:', 'speaker' ) . ' HTTP: ' . $response_code . ' URL: ' . $url
			];
			wp_send_json( $return );

		}

		/** Mute tags according to the Audio Content settings for content based generation. */
		if ( 'speaker' === $template ) {

			/** Remove blockquote elements together with any markup and line breaks inside. */
			if ( 'off' === ( $options[ 'read_quotes' ] ?? '' ) ) {
				$html = self::strip_markup( '/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/', $html );
			}

			/**
			 * Remove figcaption elements one by one. The match is lazy and spans
			 * line breaks, so text between two captions on the same line is kept
			 * and captions written over several lines are removed as well.
			 */
			if ( 'off' === ( $options[ 'read_figcaption' ] ?? '' ) ) {
				$html = self::strip_markup( '/<figcaption\b[^>]*>[\s\S]*?<\/figcaption>/', $html );
			}

			/** Expose image alt texts as figcaption elements placed right after each image. */
			if ( 'on' === ( $options[ 'read_image_alt' ] ?? '' ) ) {
				$html = self::image_alts_to_figcaptions( $html );
			}

		}

		return apply_filters( 'speaker_parse_post_content', $html );

	}

	/**
	 * Remove every match of a pattern from the HTML.
	 *
	 * @param string $pattern PCRE pattern including delimiters.
	 * @param string $html    Markup to clean.
	 *
	 * @return string Cleaned markup, or the untouched input when the regex engine reports an error.
	 */
	private static function strip_markup( string $pattern, string $html ): string {

		$cleaned = preg_replace( $pattern, '', $html );

		/** preg_replace() yields null on failure; keep the content instead of losing all of it. */
		return is_string( $cleaned ) ? $cleaned : $html;

	}

	/**
	 * Insert a figcaption holding the alt text after every image that has one.
	 *
	 * NOTE(review): the markup is loaded without an explicit encoding hint, as before;
	 * verify that the fetched page declares its charset so non-ASCII text survives.
	 *
	 * @param string $html Markup to process.
	 *
	 * @return string Serialized document, or the untouched input when it cannot be parsed.
	 * @throws DOMException
	 */
	private static function image_alts_to_figcaptions( string $html ): string {

		if ( '' === $html ) {
			return $html;
		}

		/** Collect parser warnings internally so they are not printed, then restore the previous mode. */
		$previous_mode = libxml_use_internal_errors( true );

		$doc    = new DOMDocument();
		$loaded = $doc->loadHTML( $html );

		libxml_clear_errors();
		libxml_use_internal_errors( $previous_mode );

		if ( ! $loaded ) {
			return $html;
		}

		$xpath = new DOMXPath( $doc );
		foreach ( $xpath->query( '//img' ) as $image ) {

			$alt = $image->getAttribute( 'alt' );
			if ( '' === trim( $alt ) || null === $image->parentNode ) {
				continue;
			}

			/** A text node is used because the value argument of createElement() is not escaped. */
			$figcaption = $doc->createElement( 'figcaption' );
			$figcaption->appendChild( $doc->createTextNode( $alt ) );
			$image->parentNode->insertBefore( $figcaption, $image->nextSibling );

		}

		$serialized = $doc->saveHTML();

		return is_string( $serialized ) ? $serialized : $html;

	}

	/**
	 * Resolve the content parts used to generate audio.
	 *
	 * @param mixed $stid       'content' for the whole post, 'custom-content' to use $html_parts as given,
	 *                          any other value is handed to template based generation.
	 * @param int   $post_id
	 * @param int   $page_index
	 * @param array $html_parts Parts returned untouched for 'custom-content'.
	 *
	 * @return array|false|object False when template based generation gives an empty result.
	 * @throws DOMException
	 */
	public static function content_parts( $stid, int $post_id, int $page_index = 0, array $html_parts = array() ) {

		/** Whole post content. */
		if ( 'content' === $stid ) {
			return SpeechGeneration::get_instance()->content_based_generation( $post_id, $page_index );
		}

		/** Custom content: the supplied parts are the result. */
		if ( 'custom-content' === $stid ) {
			return $html_parts;
		}

		/** Everything else is generated from a Speech Template. */
		$template_parts = SpeechGeneration::get_instance()->template_based_generation( $post_id, $stid, $page_index );

		/** An empty result signals an error. */
		return empty( $template_parts ) ? false : $template_parts;

	}

	/**
	 * Return frontend url with post content to parse.
	 *
	 * The permalink is extended with 'speaker-ssml=1' and, when given, with the
	 * 'speaker-template' and 'page' parameters.
	 *
	 * @param int         $post_id    ID of the Post/Page content from which we will parse.
	 * @param string|null $template   Template name.
	 * @param int         $page_index Page index, added only when greater than zero.
	 *
	 * @return string Request URL, or an empty string when no permalink is available.
	 */
	private static function get_frontend_url( int $post_id, ?string $template = null, int $page_index = 0 ): string {

		/** Get full permalink for the current post. */
		$url = get_permalink( $post_id );

		/** Without a permalink there is nothing to request; do not build a URL from query parameters alone. */
		if ( ! is_string( $url ) || '' === $url ) {
			return '';
		}

		/** Add speaker-ssml param to URL, continuing an existing query string when there is one. */
		$url .= ( false === strpos( $url, '?' ) ) ? '?speaker-ssml=1' : '&speaker-ssml=1';

		/** Add template param to url, encoded so reserved characters cannot break the query string. */
		if ( $template ) {
			$url .= '&speaker-template=' . rawurlencode( $template );
		}

		/** Add page index param to url */
		if ( $page_index > 0 ) {
			$url .= '&page=' . $page_index;
		}

		return $url;

	}

	/**
	 * Strip script and style elements, normalize spacing, drop muted elements and
	 * hand the result to the SSML attribute conversion and the HTML cleaner.
	 *
	 * @param $post_content
	 *
	 * @return string|string[]|null
	 * @throws DOMException
	 */
	public static function clean_content( $post_content ) {

		/** Script elements go first, then style elements; each one is replaced by a single space. */
		$without_scripts = preg_replace( '/<\s*script.+?<\s*\/\s*script.*?>/si', ' ', $post_content );
		$without_styles  = preg_replace( '/<\s*style.+?<\s*\/\s*style.*?>/si', ' ', $without_scripts );

		/** Trim the edges, then turn each tab and each run of two or more spaces into one space. */
		$normalized = preg_replace( '/[ ]{2,}|[\t]/', ' ', trim( $without_styles ) );

		/** Drop elements muted by class "speaker-mute" or attribute speaker-mute="". */
		$audible = self::remove_muted_html( $normalized );

		/** Convert data attributes to the SSML markup */ // TODO SpeechUtilities
		$with_ssml = SpeechTemplates::get_instance()->apply_ssml_attributes( $audible );

		/** Prepare HTML to splitting. */
		return XMLHelper::get_instance()->clean_html( $with_ssml );

	}

	/**
	 * Remove muted elements by class "speaker-mute" or attribute speaker-mute="".
	 *
	 * The content is wrapped in a body element, parsed in recovery mode, the muted
	 * elements are detached and the inner HTML of the last child of the document
	 * element is returned.
	 *
	 * @param string $post_content Post/Page content.
	 *
	 * @return string Content without muted elements; the trimmed input when nothing could be parsed.
	 */
	private static function remove_muted_html( string $post_content ): string {

		/** Hide DOM parsing errors while loading and remember the previous mode to restore it afterwards. */
		$previous_mode = libxml_use_internal_errors( true );
		libxml_clear_errors();

		/** Load the possibly malformed HTML into a DOMDocument. */
		$dom          = new DOMDocument();
		$dom->recover = true;
		$dom->loadHTML( '<?xml encoding="UTF-8"><body id="repair">' . $post_content . '</body>' ); // input UTF-8.

		/** Do not leave the process wide libxml error mode changed for unrelated code. */
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_mode );

		if ( null === $dom->documentElement ) {
			return trim( $post_content );
		}

		$selector = new DOMXPath( $dom );

		/** Remove all elements with speaker-mute="" attribute. */
		foreach ( $selector->query( '//*[@speaker-mute]' ) as $e ) {
			if ( null !== $e->parentNode ) {
				$e->parentNode->removeChild( $e );
			}
		}

		/**
		 * Remove all elements with class="speaker-mute". The class list is matched
		 * as whole tokens, so names that merely contain the text "speaker-mute"
		 * are left alone.
		 */
		$muted_class = '//*[contains(concat(" ", normalize-space(@class), " "), " speaker-mute ")]';
		foreach ( $selector->query( $muted_class ) as $e ) {
			if ( null !== $e->parentNode ) {
				$e->parentNode->removeChild( $e );
			}
		}

		/** HTML without muted tags. */
		$root = $dom->documentElement;
		$body = ( null !== $root ) ? $root->lastChild : null;
		if ( null === $body ) {
			return '';
		}

		return trim( XMLHelper::get_instance()->get_inner_html( $body ) );

	}

	/**
	 * Regex content replacement
	 *
	 * The 'regex_pattern' setting is read line by line as pairs: a PCRE pattern
	 * followed by its replacement on the next line. A pattern line without a
	 * replacement line is ignored.
	 *
	 * @param string $post_content Post content.
	 *
	 * @return mixed|null Content after the 'speaker_after_content_regex_replace' filter.
	 */
	public static function regex_content_replace( string $post_content ) {

		$options = Settings::get_instance()->options;

		$enabled = 'on' === ( $options[ 'regex' ] ?? '' );
		$rules   = $options[ 'regex_pattern' ] ?? '';

		/** Apply regex replacement */
		if ( $enabled && is_string( $rules ) && '' !== $rules ) {

			$expressions = preg_split( "/\r\n|\n|\r/", $rules );

			if ( is_array( $expressions ) ) {

				foreach ( array_chunk( $expressions, 2 ) as $rule ) {

					/** Skip an incomplete pair and blank pattern lines, e.g. empty lines at the end of the setting. */
					if ( count( $rule ) < 2 || '' === trim( $rule[ 0 ] ) ) {
						continue;
					}

					/** preg_replace() yields null for a broken pattern; keep the content in that case. */
					$replaced = preg_replace( $rule[ 0 ], $rule[ 1 ], $post_content );
					if ( is_string( $replaced ) ) {
						$post_content = $replaced;
					}

				}

			}

		}

		return apply_filters( 'speaker_after_content_regex_replace', $post_content );

	}

	/**
	 * Group consecutive HTML fragments into parts that stay below a byte limit.
	 *
	 * Fragments are concatenated while the part stays shorter than $maxBytes.
	 * A single fragment that reaches the limit on its own is split further by
	 * split_sentence(). Empty parts are never returned.
	 *
	 * @param array $html_array HTML fragments in document order.
	 * @param int   $maxBytes   Byte limit for one part, measured with strlen().
	 *
	 * @return array List of non-empty parts; empty when there is nothing to speak.
	 */
	public static function split_tags( array $html_array, int $maxBytes = 4999 ): array {

		$parts   = [];
		$current = '';

		foreach ( $html_array as $el ) {

			/** The fragment still fits into the part being built. */
			if ( strlen( $current ) + strlen( $el ) < $maxBytes ) {
				$current .= $el;
				continue;
			}

			/** Save previous part, unless nothing was collected yet. */
			if ( '' !== $current ) {
				$parts[] = $current;
				$current = '';
			}

			if ( strlen( $el ) >= $maxBytes ) {

				/** Split elements by sentences and keep only pieces that carry content. */
				foreach ( self::split_sentence( $el, $maxBytes ) as $sub_part ) {
					if ( '' !== $sub_part ) {
						$parts[] = $sub_part;
					}
				}

			} else {

				$current = $el;

			}

		}

		if ( '' !== $current ) {
			$parts[] = $current;
		}

		return $parts;

	}

    /**
     * Pick the divider used to split an element into smaller pieces.
     *
     * The candidates are checked in a fixed order and the first one that occurs
     * in the element is returned; a single space is the fallback.
     *
     * @param $el
     * @return string
     */
    private static function getDividerSymbol( $el ): string {

        $candidates = array( '. ', '.', ';', '?', '!', ',' );

        foreach ( $candidates as $candidate ) {

            // Splitting yields more than one piece exactly when the candidate occurs in the element.
            if ( false !== strpos( $el, $candidate ) ) {
                return $candidate;
            }

        }

        return ' ';

    }

	/**
	 * Split one oversized element into parts below a byte limit.
	 *
	 * The element is cut at the divider chosen by getDividerSymbol(). Every piece
	 * keeps the divider that followed it, so the parts joined together read like
	 * the source text and no part starts with a stray divider.
	 *
	 * A single piece that reaches the limit on its own is skipped, as before.
	 *
	 * @param string $el       Element to split.
	 * @param int    $maxBytes Byte limit for one part, measured with strlen().
	 *
	 * @return array List of non-empty parts.
	 */
	private static function split_sentence( string $el, int $maxBytes = 4999 ): array {

		$dividerSymbol = self::getDividerSymbol( $el );

		$pieces = explode( $dividerSymbol, $el );
		$last   = count( $pieces ) - 1;

		$parts   = [];
		$current = '';

		foreach ( $pieces as $index => $p ) {

			// Skip if part is too long.
			if ( strlen( $p ) >= $maxBytes ) {
				continue;
			}

			// Give the piece back the divider that explode() consumed after it.
			$token = ( $index < $last ) ? $p . $dividerSymbol : $p;

			// If the current part would get too long, save it and start new.
			if ( '' !== $current && strlen( $current ) + strlen( $token ) >= $maxBytes ) {
				$parts[] = $current;
				$current = $token;
			} else {
				$current .= $token;
			}

		}

		if ( '' !== $current ) {
			$parts[] = $current;
		}

		return $parts;

	}

	/**
	 * Prepare HTML for Google TTS.
	 *
	 * The markup is cut at closing tags, every fragment is repaired and the
	 * fragments are grouped into parts below the byte limit. When one voice
	 * element wraps the whole content, it is taken off before splitting and put
	 * around every resulting part.
	 *
	 * @param mixed  $post_id Post ID, handed to SpeechGeneration::voice_properties().
	 * @param string $html    Post/Page content to split.
	 * @param int    $max     Byte limit for one part; capped at 500 for Studio voices.
	 *
	 * @return array HTML parts to speech.
	 */
	public static function great_divider( $post_id, string $html, int $max = 4500 ): array {

		/** Studio voices can proceed only 500 bytes per request */
		$options          = Settings::get_instance()->options;
		$voice_properties = SpeechGeneration::get_instance()->voice_properties( $post_id, $html, $options );
		$voice_name       = $voice_properties[ 1 ] ?? '';

		/** Compare against false: a match at offset 0 is still a match. */
		if ( is_string( $voice_name ) && false !== strpos( $voice_name, 'Studio' ) ) {
			$max = min( $max, 500 );
		}

		/** Get voice wrapper for whole content */
		$voice_tag        = array();
		$is_voice_wrapper = false;

		/**
		 * The opening tag ends at its first closing bracket, whatever attributes
		 * it carries, so markup that follows it is not taken as part of the tag.
		 */
		$has_open  = preg_match( '/^<voice\b[^>]*>/', $html, $voice_tag_start ) === 1;
		$has_close = preg_match( '/<\/voice>$/', $html, $voice_tag_end ) === 1;

		if ( $has_open && $has_close ) {

			/** Remove voice tag for all content */
			$html = substr( $html, strlen( $voice_tag_start[ 0 ] ) );
			$html = preg_replace( '/<\/voice>$/', '', $html, 1 ) ?? $html;

			/** Store voice tags in variable */
			$voice_tag = [
				'open'  => $voice_tag_start[ 0 ],
				'close' => $voice_tag_end[ 0 ],
			];

			$is_voice_wrapper = true;

		}

		/** Divide HTML by closing tags '</' */
		$pieces = preg_split( '/<\//', $html );
		if ( ! is_array( $pieces ) ) {
			$pieces = array( $html );
		}

		$html_array = array();
		foreach ( $pieces as $i => $el ) {

			/** Fix broken tags, add '</' to all except first element. */
			if ( $i > 0 ) {
				$el = '</' . $el;
			}

			/** Only the leading piece can be empty here: the content started with a closing tag. */
			if ( '' === $el ) {
				continue;
			}

			/** Fix broken html. */
			$html_array[] = XMLHelper::get_instance()->repair_html( $el );

		}

		/** Remove empty elements, keeping fragments such as "0" that only look empty. */
		$html_array = array_filter(
			$html_array,
			static function ( $el ) {
				return null !== $el && false !== $el && '' !== $el;
			}
		);

		/** Divide into parts. */
		$parts = self::split_tags( $html_array, $max );

		/** Add voice wrapper for whole content, which was added for whole content */
		if ( $is_voice_wrapper ) {

			array_walk( $parts, [ self::class, 'voice_tag_wrap' ], $voice_tag );

		}

		return $parts;

	}

	/**
	 * Put the stored opening and closing voice tags around a part, in place.
	 *
	 * The signature follows the array_walk() callback form: value by reference,
	 * key, extra argument.
	 *
	 * @param $html Part to wrap; modified by reference.
	 * @param $key  Position of the part; not used.
	 * @param $tag  Array with the 'open' and 'close' tag strings.
	 */
	private static function voice_tag_wrap( &$html, $key, $tag ) {

		$opening = $tag[ 'open' ];
		$closing = $tag[ 'close' ];

		$html = implode( '', array( $opening, $html, $closing ) );

	}

}
