Removed the time limit, modified the CSS queries to handle cases where a book's page is missing some information, and added a selector for the book category.
This commit is contained in:
25
packt.php
25
packt.php
@@ -3,6 +3,7 @@ include 'vendor/autoload.php';
|
|||||||
|
|
||||||
use DiDom\Document;
|
use DiDom\Document;
|
||||||
|
|
||||||
|
set_time_limit(-1);
|
||||||
$settings = include 'settings.php';
|
$settings = include 'settings.php';
|
||||||
|
|
||||||
$loginData = [
|
$loginData = [
|
||||||
@@ -47,19 +48,18 @@ function c($url, $post = [])
|
|||||||
function getBookInfo($bookUrl)
|
function getBookInfo($bookUrl)
|
||||||
{
|
{
|
||||||
|
|
||||||
// $bookPage = c($bookUrl);
|
print_r($bookUrl . "\n");
|
||||||
$bookData = [];
|
$bookData = [];
|
||||||
$bookPage = new Document($bookUrl, true);
|
$bookPage = new Document($bookUrl, true);
|
||||||
$bookData['datepublished'] = $bookPage->first('.book-top-block-info-authors')->first('time[itemprop="datePublished"]::attr(datetime)');
|
$bookData['datepublished'] = $bookPage->first('.book-top-block-info-authors time[itemprop="datePublished"]::attr(datetime)');
|
||||||
$bookData['numberofpages'] = $bookPage->first('span[itemprop="numberOfPages"]::text');
|
$bookData['numberofpages'] = $bookPage->first('span[itemprop="numberOfPages"]::text');
|
||||||
$bookData['isbn'] = $bookPage->first('span[itemprop="isbn"]::text');
|
$bookData['isbn'] = $bookPage->first('span[itemprop="isbn"]::text');
|
||||||
$bookData['reviewCount'] = $bookPage->first('meta[itemprop="reviewCount"]::attr(content)');
|
$bookData['reviewCount'] = $bookPage->first('meta[itemprop="reviewCount"]::attr(content)');
|
||||||
$bookData['ratingValue'] = $bookPage->first('meta[itemprop="ratingValue"]::attr(content)');
|
$bookData['ratingValue'] = $bookPage->first('meta[itemprop="ratingValue"]::attr(content)');
|
||||||
|
|
||||||
$bookData['toc'] = [];
|
$bookData['toc'] = [];
|
||||||
foreach ($bookPage->first('#book-info-toc.onlyDesktop')->find('.book-toc-chapter') as $chapter) {
|
foreach ($bookPage->find('#book-info-toc.onlyDesktop .book-toc-chapter') as $chapter) {
|
||||||
if (!is_null($chapter->first('div[class*="book-toc-chapter-title"]'))) {
|
if (!is_null($chapter->first('div[class*="book-toc-chapter-title"]'))) {
|
||||||
|
|
||||||
$tocSection = [];
|
$tocSection = [];
|
||||||
$tocSection['title'] = trim($chapter->first('div[class*="book-toc-chapter-title"]')->text());
|
$tocSection['title'] = trim($chapter->first('div[class*="book-toc-chapter-title"]')->text());
|
||||||
$tocSection['subchapters'] = $chapter->find('div[class*="book-toc-section-text"]::text');
|
$tocSection['subchapters'] = $chapter->find('div[class*="book-toc-section-text"]::text');
|
||||||
@@ -67,8 +67,10 @@ function getBookInfo($bookUrl)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$bookData['description'] = $bookPage->first('div.book-info-bottom-indetail-text[itemprop="description"]')->find('p::text');
|
$bookData['description'] = $bookPage->find('div.book-info-bottom-indetail-text[itemprop="description"] p::text');
|
||||||
$bookData['willLearn'] = $bookPage->first('div.book-info-will-learn-text')->find('li::text');
|
$bookData['willLearn'] = $bookPage->find('div.book-info-will-learn-text li::text');
|
||||||
|
|
||||||
|
$bookData['category'] = $bookPage->first('div[data-product-id="' . $bookData['isbn'] . '"]::attr(data-product-category)');
|
||||||
|
|
||||||
$bookData['authors'] = [];
|
$bookData['authors'] = [];
|
||||||
foreach ($bookPage->find('[itemprop="author"]') as $author) {
|
foreach ($bookPage->find('[itemprop="author"]') as $author) {
|
||||||
@@ -92,9 +94,13 @@ $document = new Document('packt.html', true);
|
|||||||
|
|
||||||
$booksData = [];
|
$booksData = [];
|
||||||
$books = $document->find('.product-line.unseen');
|
$books = $document->find('.product-line.unseen');
|
||||||
$books = array_slice($books, 0, 1);
|
//$books = array_slice($books, 2, 1);
|
||||||
|
|
||||||
|
$bookData = json_decode('books.txt', 1);
|
||||||
|
|
||||||
foreach ($books as $book) {
|
foreach ($books as $book) {
|
||||||
|
|
||||||
|
|
||||||
// print_r($book->html());
|
// print_r($book->html());
|
||||||
$bookData = [];
|
$bookData = [];
|
||||||
$bookData['title'] = str_replace(["\r\n"], '', trim($book->first('.title::text')));
|
$bookData['title'] = str_replace(["\r\n"], '', trim($book->first('.title::text')));
|
||||||
@@ -109,12 +115,11 @@ foreach ($books as $book) {
|
|||||||
$bookData['url'] = $book->first('div[class*=product-thumbnail]')->first('a::attr(href)');
|
$bookData['url'] = $book->first('div[class*=product-thumbnail]')->first('a::attr(href)');
|
||||||
$bookData['info'] = getBookInfo($bookData['url']);
|
$bookData['info'] = getBookInfo($bookData['url']);
|
||||||
|
|
||||||
|
$booksData[$bookData['nid']] = $bookData;
|
||||||
$booksData[] = $bookData;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
print_r($booksData);
|
print_r($booksData);
|
||||||
|
|
||||||
|
file_put_contents('books.txt', json_encode($booksData));
|
||||||
|
|
||||||
|
|
||||||
//print_r($return);
|
//print_r($return);
|
||||||
|
|||||||
Reference in New Issue
Block a user