Updated the way a single book page is parsed to extract the language, publisher, and book description. Also, the URL of the page is now attached to the book info object.

2022-05-27 10:54:39 +02:00
parent 825348ce7a
commit 293a776d16
3 changed files with 11 additions and 8 deletions
@@ -18,21 +18,23 @@ class DataParser
    {
        $info = new Info();
        $jsonInfo = json_decode($document->first('script[type="application/ld+json"]')->text());
+        $info->url = $document->getDocument()->baseURI;
+        $info->publisher = $document->first('a[href*="wydawnictwo"]')?->text();
        $info->author = $document->first('meta[property="books:author"]')->getAttribute('content');
        $info->isbn = $document->first('meta[property="books:isbn"]')->getAttribute('content');
-        $info->description = $document->first('meta[property="og:description"]')->getAttribute('content');
+        $info->description = trim($document->first('#book-description p')->text());
        $info->title = trim($document->first('h1.book__title')->text());
        $info->category = trim($document->first('.book__category')->text());
        $info->cover_url = $this->generateCoverUrls(
            $document->first('meta[property="og:image"]')->getAttribute('content')
        );
        $info->pages = (int)$document->first('span.book__pages')?->text();
-        if (preg_match('#(.*) \(tom (\d*)\)#ism', trim($document->first('a[href*="/cykl/"]')?->text()), $series)) {
+        if (preg_match('#(.*) \(tom (\d*)\)#ism', trim($document->first('a[href*="/cykl/"]')?->text() ?? ''), $series)) {
            $info->cycle = $series[1];
            $info->volume = $series[2];
        }
-        $info->language = $jsonInfo->inLanguage ?? null;
-        $info->datePublished = $jsonInfo->datePublished ?? null;
+        $info->language = $jsonInfo?->inLanguage ?? trim($document->xpath("//*[contains(text(), 'Język:')]")[0]->nextSibling('dd')->text());
+        $info->datePublished = $jsonInfo?->datePublished ?? null;

        return $info;
    }