Skip to content

Commit

Permalink
Merge pull request #405 from leepeuker/improve-imdb-rating-extractor-…
Browse files Browse the repository at this point in the history
…stability

Bugfix: Improve imdb rating sync stability for not yet released movies
  • Loading branch information
leepeuker committed Jun 23, 2023
2 parents cf39895 + 9b880c6 commit a6e7324
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 16 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,4 @@ app_jobs_process:

# Shortcuts
php: exec_app_bash
test: composer_test
10 changes: 6 additions & 4 deletions docs/features/imdb-rating.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,20 @@ Movies without IMDb ratings or updated the longest time ago are prioritized.
php bin/console.php imdb:sync
```

### Important flags
### Interesting flags

- `--help`
Detailed information about the command
- `--hours`
Only sync movie ratings which were last synced at least X hours ago
Number of hours required to have elapsed since last sync
- `--threshold`
Maximum number of movie ratings to sync
Maximum number of movies to sync
- `--movieIds`
Comma separated string of movie ids to force sync for

### Example

Update ratings for the first 30 movies which were updated at least 24 hours ago
Update ratings for the first 30 movies which were not updated in the last 24 hours
```shell
php bin/console.php imdb:sync --hours 24 --threshold 30
```
20 changes: 20 additions & 0 deletions src/Api/Imdb/ImdbWebScrapper.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@ public function findRating(string $imdbId) : ?ImdbRating
return null;
}

$productionStatus = $this->extractProductionStatus($imdbMovieRatingPage);
if ($productionStatus !== null) {
$this->logger->debug('IMDb: Ignoring not yet released movie', [
'url' => $this->urlGenerator->buildMovieUrl($imdbId),
'productionStatus' => $productionStatus,
]);

return null;
}

$ratingAverage = $this->extractRatingAverage($imdbMovieRatingPage, $imdbId);
if ($ratingAverage === null) {
return null;
Expand All @@ -48,6 +58,16 @@ public function findRating(string $imdbId) : ?ImdbRating
return $imdbRating;
}

/**
 * Reads the production status label (e.g. "Post-production") out of the given page markup.
 *
 * The label is whatever text follows the `hjAonB">` marker, up to the next `<`.
 *
 * @return string|null The captured text, or null when the marker is absent or the captured text is empty
 */
private function extractProductionStatus(string $imdbRatingPage) : ?string
{
    $matches = [];
    preg_match('~hjAonB">([^<]*)~', $imdbRatingPage, $matches);

    // A missing capture group is handled like an empty capture.
    $status = $matches[1] ?? '';

    // Mirrors empty() for strings: both '' and '0' count as "no status".
    if ($status === '' || $status === '0') {
        return null;
    }

    return $status;
}

private function extractRatingAverage(string $imdbRatingPage, string $imdbId) : ?float
{
preg_match('/iZlgcd">(\d([.,])\d)/', $imdbRatingPage, $averageRatingMatches);
Expand Down
16 changes: 11 additions & 5 deletions src/Command/ImdbSync.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class ImdbSync extends Command

private const OPTION_NAME_FORCE_THRESHOLD = 'threshold';

private const OPTION_NAME_MOVIE_IDS = 'movieIds';

protected static $defaultName = 'imdb:sync';

public function __construct(
Expand All @@ -30,9 +32,10 @@ public function __construct(
protected function configure() : void
{
$this
->setDescription('Sync imdb ratings for local movies.')
->addOption(self::OPTION_NAME_FORCE_THRESHOLD, 'threshold', InputOption::VALUE_REQUIRED, 'Max number of movies to sync.')
->addOption(self::OPTION_NAME_FORCE_HOURS, 'hours', InputOption::VALUE_REQUIRED, 'Hours since last updated.');
->setDescription('Sync imdb ratings for local movies, sorted by how outdated they are (oldest first).')
->addOption(self::OPTION_NAME_MOVIE_IDS, 'movieIds', InputOption::VALUE_REQUIRED, 'Comma separated string of movie ids to force sync.')
->addOption(self::OPTION_NAME_FORCE_THRESHOLD, 'threshold', InputOption::VALUE_REQUIRED, 'Maximum number of movies to sync.')
->addOption(self::OPTION_NAME_FORCE_HOURS, 'hours', InputOption::VALUE_REQUIRED, 'Number of hours required to have elapsed since last sync.');
}

protected function execute(InputInterface $input, OutputInterface $output) : int
Expand All @@ -41,14 +44,17 @@ protected function execute(InputInterface $input, OutputInterface $output) : int
$maxAgeInHours = $hoursOption !== null ? (int)$hoursOption : null;

$thresholdOption = $input->getOption(self::OPTION_NAME_FORCE_THRESHOLD);
$movieCountSyncThreshold = $thresholdOption !== null ? (int)$thresholdOption : null;
$movieCountSyncThreshold = (int)$thresholdOption !== 0 ? (int)$thresholdOption : null;

$movieIdsOption = $input->getOption(self::OPTION_NAME_MOVIE_IDS);
$movieIds = (string)$movieIdsOption !== '' ? array_map('intval', explode(',', $movieIdsOption)) : null;

$jobId = $this->jobQueueApi->addImdbSyncJob(JobStatus::createInProgress());

try {
$this->generateOutput($output, 'Syncing imdb movie ratings...');

$this->imdbMovieRatingSync->syncMultipleMovieRatings($maxAgeInHours, $movieCountSyncThreshold);
$this->imdbMovieRatingSync->syncMultipleMovieRatings($maxAgeInHours, $movieCountSyncThreshold, $movieIds);

$this->jobQueueApi->updateJobStatus($jobId, JobStatus::createDone());

Expand Down
4 changes: 2 additions & 2 deletions src/Domain/Movie/MovieApi.php
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,9 @@ public function fetchHistoryOrderedByWatchedAtDesc(int $userId) : array
return $this->historyApi->fetchHistoryOrderedByWatchedAtDesc($userId);
}

public function fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt(?int $maxAgeInHours = null, ?int $limit = null) : array
public function fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt(?int $maxAgeInHours = null, ?int $limit = null, ?array $filterMovieIds = null) : array
{
return $this->movieRepository->fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt($maxAgeInHours, $limit);
return $this->movieRepository->fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt($maxAgeInHours, $limit, $filterMovieIds);
}

public function fetchUniqueMovieGenres(int $userId) : array
Expand Down
11 changes: 8 additions & 3 deletions src/Domain/Movie/MovieRepository.php
Original file line number Diff line number Diff line change
Expand Up @@ -424,26 +424,31 @@ public function fetchMostWatchedReleaseYears(int $userId) : array
);
}

public function fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt(?int $maxAgeInHours = null, ?int $limit = null) : array
public function fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt(?int $maxAgeInHours = null, ?int $limit = null, ?array $filterMovieIds = null) : array
{
$limitQuery = '';
if ($limit !== null) {
$limitQuery = " LIMIT $limit";
}

$filterMovieIdsQuery = '';
if ($filterMovieIds !== null) {
$filterMovieIdsQuery = ' AND movie.id IN (' . implode(',', $filterMovieIds) . ')';
}

if ($this->dbConnection->getDatabasePlatform() instanceof SqlitePlatform) {
return $this->dbConnection->fetchFirstColumn(
'SELECT movie.id
FROM `movie`
WHERE movie.imdb_id IS NOT NULL AND (updated_at_imdb IS NULL OR updated_at_imdb <= datetime("now","-' . $maxAgeInHours . ' hours"))
WHERE movie.imdb_id IS NOT NULL AND (updated_at_imdb IS NULL OR updated_at_imdb <= datetime("now","-' . $maxAgeInHours . ' hours"))' . $filterMovieIdsQuery . '
ORDER BY updated_at_imdb ASC' . $limitQuery,
);
}

return $this->dbConnection->fetchFirstColumn(
'SELECT movie.id
FROM `movie`
WHERE movie.imdb_id IS NOT NULL AND (updated_at_imdb IS NULL OR updated_at_imdb <= DATE_SUB(NOW(), INTERVAL ? HOUR))
WHERE movie.imdb_id IS NOT NULL AND (updated_at_imdb IS NULL OR updated_at_imdb <= DATE_SUB(NOW(), INTERVAL ? HOUR))' . $filterMovieIdsQuery . '
ORDER BY updated_at_imdb ASC' . $limitQuery,
[(int)$maxAgeInHours],
);
Expand Down
7 changes: 5 additions & 2 deletions src/Service/Imdb/ImdbMovieRatingSync.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use Movary\Api\Imdb\ImdbWebScrapper;
use Movary\Domain\Movie\MovieApi;
use Movary\Domain\Movie\MovieEntity;
use Movary\ValueObject\DateTime;
use Movary\ValueObject\ImdbRating;
use Psr\Log\LoggerInterface;

Expand Down Expand Up @@ -44,6 +45,7 @@ public function syncMovieRating(int $movieId) : void
$this->logger->debug('IMDb: Skipped updating not changed movie rating', [$this->generateMovieLogData($movie)]);

$this->movieApi->updateImdbTimestamp($movieId);

return;
}

Expand All @@ -68,14 +70,15 @@ public function syncMovieRating(int $movieId) : void
public function syncMultipleMovieRatings(
?int $maxAgeInHours = null,
?int $movieCountSyncThreshold = null,
array $movieIds = null,
int $minDelayBetweenRequests = self::DEFAULT_MIN_DELAY_BETWEEN_REQUESTS_IN_MS,
) : void {
$movieIds = $this->movieApi->fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt($maxAgeInHours, $movieCountSyncThreshold);
$movieIds = $this->movieApi->fetchMovieIdsHavingImdbIdOrderedByLastImdbUpdatedAt($maxAgeInHours, $movieCountSyncThreshold, $movieIds);

foreach ($movieIds as $index => $movieId) {
$this->syncMovieRating($movieId);

if ($index === array_key_last($movieIds)) {
if ($index === array_key_last($movieIds) || ((int)$movieCountSyncThreshold !== 0 && (int)$index + 1 >= $movieCountSyncThreshold)) {
break;
}

Expand Down
6 changes: 6 additions & 0 deletions tests/unit/Api/Imdb/ImdbWebScrapperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ public function provideFindRatingData() : array
bjjENQ">229.240</div>',
ImdbRating::create(7.9, 229240)
],
'returns no rating if current production status is found' => [
'hjAonB">Post-production
iZlgcd">7.9</span>
bjjENQ">229.240</div>',
null,
],
[
'iZlgcd">7,9</span>
bjjENQ">229,240</div>',
Expand Down

0 comments on commit a6e7324

Please sign in to comment.