Removed the time limit, modified the CSS queries to handle cases where a book's page is missing some information, and added a selector for the book category.

This commit is contained in:
krzysiej
2018-04-11 15:12:38 +02:00
parent db1aed0ee3
commit f016f0e4f2

View File

@@ -3,6 +3,7 @@ include 'vendor/autoload.php';
use DiDom\Document;
set_time_limit(-1);
$settings = include 'settings.php';
$loginData = [
@@ -47,19 +48,18 @@ function c($url, $post = [])
function getBookInfo($bookUrl)
{
// $bookPage = c($bookUrl);
print_r($bookUrl . "\n");
$bookData = [];
$bookPage = new Document($bookUrl, true);
$bookData['datepublished'] = $bookPage->first('.book-top-block-info-authors')->first('time[itemprop="datePublished"]::attr(datetime)');
$bookData['datepublished'] = $bookPage->first('.book-top-block-info-authors time[itemprop="datePublished"]::attr(datetime)');
$bookData['numberofpages'] = $bookPage->first('span[itemprop="numberOfPages"]::text');
$bookData['isbn'] = $bookPage->first('span[itemprop="isbn"]::text');
$bookData['reviewCount'] = $bookPage->first('meta[itemprop="reviewCount"]::attr(content)');
$bookData['ratingValue'] = $bookPage->first('meta[itemprop="ratingValue"]::attr(content)');
$bookData['toc'] = [];
foreach ($bookPage->first('#book-info-toc.onlyDesktop')->find('.book-toc-chapter') as $chapter) {
foreach ($bookPage->find('#book-info-toc.onlyDesktop .book-toc-chapter') as $chapter) {
if (!is_null($chapter->first('div[class*="book-toc-chapter-title"]'))) {
$tocSection = [];
$tocSection['title'] = trim($chapter->first('div[class*="book-toc-chapter-title"]')->text());
$tocSection['subchapters'] = $chapter->find('div[class*="book-toc-section-text"]::text');
@@ -67,8 +67,10 @@ function getBookInfo($bookUrl)
}
}
$bookData['description'] = $bookPage->first('div.book-info-bottom-indetail-text[itemprop="description"]')->find('p::text');
$bookData['willLearn'] = $bookPage->first('div.book-info-will-learn-text')->find('li::text');
$bookData['description'] = $bookPage->find('div.book-info-bottom-indetail-text[itemprop="description"] p::text');
$bookData['willLearn'] = $bookPage->find('div.book-info-will-learn-text li::text');
$bookData['category'] = $bookPage->first('div[data-product-id="' . $bookData['isbn'] . '"]::attr(data-product-category)');
$bookData['authors'] = [];
foreach ($bookPage->find('[itemprop="author"]') as $author) {
@@ -92,9 +94,13 @@ $document = new Document('packt.html', true);
$booksData = [];
$books = $document->find('.product-line.unseen');
$books = array_slice($books, 0, 1);
//$books = array_slice($books, 2, 1);
$bookData = json_decode('books.txt', 1);
foreach ($books as $book) {
// print_r($book->html());
$bookData = [];
$bookData['title'] = str_replace(["\r\n"], '', trim($book->first('.title::text')));
@@ -109,12 +115,11 @@ foreach ($books as $book) {
$bookData['url'] = $book->first('div[class*=product-thumbnail]')->first('a::attr(href)');
$bookData['info'] = getBookInfo($bookData['url']);
$booksData[] = $bookData;
$booksData[$bookData['nid']] = $bookData;
}
print_r($booksData);
file_put_contents('books.txt', json_encode($booksData));
//print_r($return);