<?php
namespace App\Parser;

use Symfony\Component\DomCrawler\Crawler;

/**
 * Extracts product data from a product page's HTML.
 *
 * Structured data (JSON-LD "Product" and "BreadcrumbList" nodes) is the
 * primary source; a few fields fall back to, or are taken from, the DOM
 * (h1, canonical link, .product__description, img tags).
 */
class ProductParser
{
    /**
     * Parse a product page.
     *
     * @param string $html Raw HTML of the product page.
     *
     * @return array Map with the keys: source_url, sku, name, price, currency,
     *               availability, brand, categories, description_html,
     *               description_text, images, main_image_url. Values that
     *               cannot be determined are null (or an empty list for
     *               categories/images); currency defaults to 'SAR'.
     */
    public function parse(string $html): array
    {
        $crawler = new Crawler($html);

        $jsonLd = $this->extractJsonLd($crawler);
        $product = $jsonLd['product'] ?? null;
        $breadcrumb = $jsonLd['breadcrumb'] ?? null;

        // Name: JSON-LD first, then the first <h1> on the page.
        $name = $product['name'] ?? null;
        if ($name === null) {
            $heading = $crawler->filter('h1');
            $name = $heading->count() ? trim($heading->first()->text()) : null;
        }

        $sku = $product['sku'] ?? null;

        // Brand may be a nested node ({"name": ...}) or a plain string.
        $brand = $product['brand'] ?? null;
        if (is_array($brand)) {
            $brand = $brand['name'] ?? null;
        } elseif (!is_string($brand)) {
            $brand = null;
        }

        $offer = $this->firstOffer($product['offers'] ?? null);
        $price = $offer['price'] ?? null;
        $currency = $offer['priceCurrency'] ?? 'SAR';
        $availability = $offer['availability'] ?? null;

        $canonicalLink = $crawler->filterXPath('//link[@rel="canonical"]');
        $canonical = $canonicalLink->count() ? $canonicalLink->attr('href') : null;

        // Description: prefer the HTML block if it exists, else JSON-LD text.
        $descHtml = null;
        $descNode = $crawler->filter('.product__description');
        if ($descNode->count()) {
            $descHtml = $descNode->html();
        } elseif (is_string($product['description'] ?? null)) {
            // Only strings are accepted so strip_tags() below never receives an array.
            $descHtml = $product['description'];
        }
        $descText = ($descHtml !== null && $descHtml !== '') ? trim(strip_tags($descHtml)) : null;

        $categories = $this->extractCategories($breadcrumb);

        $images = $this->extractImages($crawler, $product);

        // Main image: first collected URL, else whatever JSON-LD declares.
        $mainImage = $images[0] ?? ($product['image'] ?? null);
        if (is_array($mainImage)) {
            // Either a list (take its first entry) or an ImageObject-style node (take its url).
            $mainImage = $mainImage[0] ?? ($mainImage['url'] ?? null);
            if (is_array($mainImage)) {
                $mainImage = $mainImage['url'] ?? null;
            }
        }

        return [
            'source_url' => $canonical,
            'sku' => $sku,
            'name' => $name,
            'price' => $price,
            'currency' => $currency,
            'availability' => $availability,
            'brand' => $brand,
            'categories' => $categories,
            'description_html' => $descHtml,
            'description_text' => $descText,
            'images' => $images,
            'main_image_url' => is_string($mainImage) ? $mainImage : null,
        ];
    }

    /**
     * Find the JSON-LD Product and BreadcrumbList nodes in the document.
     *
     * Every <script type="application/ld+json"> is inspected. A script may
     * hold a single node, a top-level list of nodes, and/or an "@graph" list.
     * When several nodes of the same type exist, the last one wins.
     *
     * @return array ['product' => ?array, 'breadcrumb' => ?array]
     */
    private function extractJsonLd(Crawler $crawler): array
    {
        $product = null;
        $breadcrumb = null;

        $crawler->filterXPath('//script[@type="application/ld+json"]')->each(function (Crawler $n) use (&$product, &$breadcrumb) {
            $json = trim($n->text());
            if ($json === '') {
                return;
            }

            $data = $this->decodeJsonLd($json);
            if ($data === null) {
                return;
            }

            // A top-level JSON list holds several nodes; a JSON object is a single node.
            $roots = isset($data[0]) ? $data : [$data];

            foreach ($roots as $root) {
                if (!is_array($root)) {
                    continue;
                }

                $candidates = [];
                if (isset($root['@type'])) {
                    $candidates[] = $root;
                }
                if (isset($root['@graph']) && is_array($root['@graph'])) {
                    $candidates = array_merge($candidates, $root['@graph']);
                }

                foreach ($candidates as $item) {
                    if (!is_array($item)) {
                        continue;
                    }
                    if ($this->hasType($item, 'Product')) {
                        $product = $item;
                    }
                    if ($this->hasType($item, 'BreadcrumbList')) {
                        $breadcrumb = $item;
                    }
                }
            }
        });

        return ['product' => $product, 'breadcrumb' => $breadcrumb];
    }

    /**
     * Decode one JSON-LD script body.
     *
     * The entity-decoded text is tried first. If that is not valid JSON
     * (decoding an entity such as &quot; inside a string value produces an
     * unescaped quote), the raw text is tried as a fallback.
     *
     * @return array|null Decoded data, or null when neither attempt yields an array.
     */
    private function decodeJsonLd(string $json): ?array
    {
        $decoded = json_decode(html_entity_decode($json, ENT_QUOTES | ENT_HTML5, 'UTF-8'), true);
        if (!is_array($decoded)) {
            $decoded = json_decode($json, true);
        }

        return is_array($decoded) ? $decoded : null;
    }

    /**
     * Tell whether a JSON-LD node declares the given "@type".
     *
     * "@type" may be a single string or a list of strings.
     */
    private function hasType(array $node, string $type): bool
    {
        $declared = $node['@type'] ?? null;
        if (is_array($declared)) {
            return in_array($type, $declared, true);
        }

        return $declared === $type;
    }

    /**
     * Reduce the JSON-LD "offers" value to a single offer node.
     *
     * @param mixed $offers A single offer node, a list of offer nodes, or anything else.
     *
     * @return array|null The node itself, the first entry of a list, or null.
     */
    private function firstOffer($offers): ?array
    {
        if (!is_array($offers)) {
            return null;
        }
        if (isset($offers[0])) {
            return is_array($offers[0]) ? $offers[0] : null;
        }

        return $offers;
    }

    /**
     * Collect category names from a BreadcrumbList node.
     *
     * For each list element the name is read from item.name, falling back to
     * the element's own "name" (used when "item" is just a URL string).
     *
     * @return array List of names in document order; empty when unavailable.
     */
    private function extractCategories(?array $breadcrumb): array
    {
        $elements = $breadcrumb['itemListElement'] ?? null;
        if (!is_array($elements)) {
            return [];
        }
        // A single element given as an object instead of a list.
        if (!isset($elements[0])) {
            $elements = [$elements];
        }

        $categories = [];
        foreach ($elements as $element) {
            if (!is_array($element)) {
                continue;
            }
            $label = $element['item']['name'] ?? ($element['name'] ?? null);
            if ($label) {
                $categories[] = $label;
            }
        }

        return $categories;
    }

    /**
     * Collect absolute http(s) image URLs, de-duplicated in first-seen order.
     *
     * All <img> tags are scanned (data-src is preferred over src). Only when
     * none of them yields a URL is the JSON-LD "image" value used, which may
     * be a string, an object with a "url" key, or a list of either.
     *
     * @return array List of URL strings.
     */
    private function extractImages(Crawler $crawler, ?array $product): array
    {
        $urls = [];

        // Prefer gallery images (data-src points to CDN)
        $crawler->filter('img')->each(function (Crawler $img) use (&$urls) {
            $u = $img->attr('data-src') ?: $img->attr('src');
            if ($u && preg_match('~^https?://~', $u)) {
                $urls[] = $u;
            }
        });

        // Fallback: JSON-LD image
        if (empty($urls) && is_array($product) && isset($product['image'])) {
            $image = $product['image'];
            // Normalise to a list: a string or a single {"url": ...} node becomes a one-element list.
            $entries = (is_array($image) && !isset($image['url'])) ? $image : [$image];
            foreach ($entries as $entry) {
                if (is_array($entry)) {
                    $entry = $entry['url'] ?? null;
                }
                if (is_string($entry) && preg_match('~^https?://~', $entry)) {
                    $urls[] = $entry;
                }
            }
        }

        // Unique keep order (array_unique keeps the first occurrence of each value)
        return array_values(array_unique($urls));
    }
}
