From f016f0e4f2ba8151484ac0b6273e38713da3b4c1 Mon Sep 17 00:00:00 2001 From: krzysiej Date: Wed, 11 Apr 2018 15:12:38 +0200 Subject: [PATCH] Removed time limit, modified css queries to avoid cases when books page is missing some information, added a selector for book category. --- packt.php | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/packt.php b/packt.php index 5c2c13b..3128439 100644 --- a/packt.php +++ b/packt.php @@ -3,6 +3,7 @@ include 'vendor/autoload.php'; use DiDom\Document; +set_time_limit(-1); $settings = include 'settings.php'; $loginData = [ @@ -47,19 +48,18 @@ function c($url, $post = []) function getBookInfo($bookUrl) { -// $bookPage = c($bookUrl); + print_r($bookUrl . "\n"); $bookData = []; $bookPage = new Document($bookUrl, true); - $bookData['datepublished'] = $bookPage->first('.book-top-block-info-authors')->first('time[itemprop="datePublished"]::attr(datetime)'); + $bookData['datepublished'] = $bookPage->first('.book-top-block-info-authors time[itemprop="datePublished"]::attr(datetime)'); $bookData['numberofpages'] = $bookPage->first('span[itemprop="numberOfPages"]::text'); $bookData['isbn'] = $bookPage->first('span[itemprop="isbn"]::text'); $bookData['reviewCount'] = $bookPage->first('meta[itemprop="reviewCount"]::attr(content)'); $bookData['ratingValue'] = $bookPage->first('meta[itemprop="ratingValue"]::attr(content)'); $bookData['toc'] = []; - foreach ($bookPage->first('#book-info-toc.onlyDesktop')->find('.book-toc-chapter') as $chapter) { + foreach ($bookPage->find('#book-info-toc.onlyDesktop .book-toc-chapter') as $chapter) { if (!is_null($chapter->first('div[class*="book-toc-chapter-title"]'))) { - $tocSection = []; $tocSection['title'] = trim($chapter->first('div[class*="book-toc-chapter-title"]')->text()); $tocSection['subchapters'] = $chapter->find('div[class*="book-toc-section-text"]::text'); @@ -67,8 +67,10 @@ function getBookInfo($bookUrl) } } - $bookData['description'] = 
$bookPage->first('div.book-info-bottom-indetail-text[itemprop="description"]')->find('p::text'); - $bookData['willLearn'] = $bookPage->first('div.book-info-will-learn-text')->find('li::text'); + $bookData['description'] = $bookPage->find('div.book-info-bottom-indetail-text[itemprop="description"] p::text'); + $bookData['willLearn'] = $bookPage->find('div.book-info-will-learn-text li::text'); + + $bookData['category'] = $bookPage->first('div[data-product-id="' . $bookData['isbn'] . '"]::attr(data-product-category)'); $bookData['authors'] = []; foreach ($bookPage->find('[itemprop="author"]') as $author) { @@ -92,9 +94,13 @@ $document = new Document('packt.html', true); $booksData = []; $books = $document->find('.product-line.unseen'); -$books = array_slice($books, 0, 1); +//$books = array_slice($books, 2, 1); + +$bookData = json_decode('books.txt', 1); + foreach ($books as $book) { + // print_r($book->html()); $bookData = []; $bookData['title'] = str_replace(["\r\n"], '', trim($book->first('.title::text'))); @@ -109,12 +115,11 @@ foreach ($books as $book) { $bookData['url'] = $book->first('div[class*=product-thumbnail]')->first('a::attr(href)'); $bookData['info'] = getBookInfo($bookData['url']); - - $booksData[] = $bookData; - + $booksData[$bookData['nid']] = $bookData; } print_r($booksData); +file_put_contents('books.txt', json_encode($booksData)); //print_r($return);