Skip to content

Commit

Permalink
Fixed bug where 1x1 tracking pixels could be selected as cover images
Browse files Browse the repository at this point in the history
  • Loading branch information
spacecowboy committed Jul 18, 2023
1 parent ce99c46 commit 7b2c28e
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ class FeedParser(override val di: DI) : DIAware {
html: String,
baseUrl: URL? = null,
): String? {
val doc = Jsoup.parse(html.byteInputStream(), "UTF-8", "")
val doc = html.byteInputStream().use {
Jsoup.parse(it, "UTF-8", baseUrl?.toString() ?: "")
}

return (
doc.getElementsByAttributeValue("rel", "apple-touch-icon") +
Expand All @@ -100,7 +102,9 @@ class FeedParser(override val di: DI) : DIAware {
html: String,
baseUrl: URL? = null,
): List<AlternateLink> {
val doc = Jsoup.parse(html.byteInputStream(), "UTF-8", "")
val doc = html.byteInputStream().use {
Jsoup.parse(it, "UTF-8", "")
}

val feeds = doc.getElementsByAttributeValue("rel", "alternate")
?.filter { it.hasAttr("href") && it.hasAttr("type") }
Expand Down
28 changes: 18 additions & 10 deletions app/src/main/java/com/nononsenseapps/feeder/util/HtmlUtils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,27 @@ import android.widget.Toast
import androidx.annotation.ColorInt
import androidx.browser.customtabs.CustomTabsIntent
import com.nononsenseapps.feeder.R
import org.jsoup.Jsoup
import org.jsoup.parser.Parser.unescapeEntities

// Using negative lookahead to skip data: urls, being inline base64
// And capturing original quote to use as ending quote
private val regexImgSrc = """img.*?src=(["'])((?!data).*?)\1""".toRegex(RegexOption.DOT_MATCHES_ALL)

fun naiveFindImageLink(text: String?): String? =
fun findFirstImageLinkInHtml(text: String?, baseUrl: String?): String? =
if (text != null) {
val imgLink = regexImgSrc.find(text)?.groupValues?.get(2)
if (imgLink?.contains("twitter_icon", ignoreCase = true) == true) {
null
} else {
imgLink
val doc = unescapeEntities(text, true).byteInputStream().use {
Jsoup.parse(it, "UTF-8", baseUrl ?: "")
}

doc.getElementsByTag("img").asSequence()
.filterNot { it.attr("width") == "1" || it.attr("height") == "1" }
.map {
// abs: will resolve relative urls against the baseurl - and non-url value will get
// dropped, such as invalid values and data/base64 values
it.attr("abs:src")
}
.firstOrNull {
it.isNotBlank()
&& !it.contains("twitter_icon", ignoreCase = true)
&& !it.contains("facebook_icon", ignoreCase = true)
}
} else {
null
}
Expand Down
20 changes: 2 additions & 18 deletions app/src/main/java/com/nononsenseapps/feeder/util/RomeExtensions.kt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import com.rometools.rome.feed.synd.SyndEntry
import com.rometools.rome.feed.synd.SyndFeed
import com.rometools.rome.feed.synd.SyndPerson
import java.net.URL
import org.jsoup.parser.Parser.unescapeEntities
import org.threeten.bp.Instant
import org.threeten.bp.ZoneOffset
import org.threeten.bp.ZonedDateTime
Expand Down Expand Up @@ -241,24 +240,9 @@ fun SyndEntry.thumbnail(feedBaseUrl: URL): String? {
return when {
thumbnail != null -> relativeLinkIntoAbsolute(feedBaseUrl, thumbnail.url)
else -> {
val imgLink: String? =
naiveFindImageLink(this.contentHtml())?.let { unescapeEntities(it, true) }
// Now we are resolving against original, not the feed
val siteBaseUrl: String? = this.linkToHtml(feedBaseUrl)

try {
when {
siteBaseUrl != null && imgLink != null -> relativeLinkIntoAbsolute(
URL(siteBaseUrl),
imgLink,
)
imgLink != null -> relativeLinkIntoAbsolute(feedBaseUrl, imgLink)
else -> null
}
} catch (t: Throwable) {
Log.e("FeederRomeExt", "Encountered some bad link: [$siteBaseUrl, $feedBaseUrl]", t)
null
}
val baseUrl: String = this.linkToHtml(feedBaseUrl) ?: feedBaseUrl.toString()
findFirstImageLinkInHtml(this.contentHtml(), baseUrl)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,6 @@ class FeedParserClientTest : DIAware {
val url = server.url("/foo").toUrl()
// This should not crash
val result = feedParser.parseFeedUrl(url)
assertNull(result?.items?.first()?.image)
assertEquals("http://www.questionablecontent.net/comics/4776.png", result?.items?.first()?.image)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,6 @@ class FeedParserTest : DIAware {
}

@Test
@Throws(Exception::class)
fun nixos() = runBlocking {
val feed = nixosRss.use { feedParser.parseFeedResponse(it) }
assertNotNull(feed)
Expand Down
28 changes: 10 additions & 18 deletions app/src/test/java/com/nononsenseapps/feeder/util/HtmlUtilsKtTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,21 @@ class HtmlUtilsKtTest {
ANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4
//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU
5ErkJggg==" alt="Red dot" /&gt;</summary>"""
assertNull(naiveFindImageLink(text))
assertNull(findFirstImageLinkInHtml(text, null))
}

@Test
fun ignoresBase64InlineImagesSingleLine() {
val text = """<summary type="html">&lt;img src="" alt="Red dot" /&gt;</summary>""".trimMargin()
assertNull(naiveFindImageLink(text))
assertNull(findFirstImageLinkInHtml(text, null))
}

@Test
fun findsImageInSingleLine() {
val text = "<summary type=\"html\">&lt;img src=\"https://imgs.xkcd.com/comics/interstellar_asteroid.png\" title=\"Every time we detect an asteroid from outside the Solar System, we should immediately launch a mission to fling one of our asteroids back in the direction it came from.\" alt=\"Every time we detect an asteroid from outside the Solar System, we should immediately launch a mission to fling one of our asteroids back in the direction it came from.\" /&gt;</summary>"
assertEquals(
"https://imgs.xkcd.com/comics/interstellar_asteroid.png",
naiveFindImageLink(text)
findFirstImageLinkInHtml(text, null)
)
}

Expand All @@ -35,36 +35,28 @@ ANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4
val text = "<summary type=\"html\">&lt;img title=\"Every time we detect an asteroid from outside the Solar System, we should immediately launch a mission to fling one of our asteroids back in the direction it came from.\" alt=\"Every time we detect an asteroid from outside the Solar System, we should immediately launch a mission to fling one of our asteroids back in the direction it came from.\" src=\"https://imgs.xkcd.com/comics/interstellar_asteroid.png\" /&gt;</summary>"
assertEquals(
"https://imgs.xkcd.com/comics/interstellar_asteroid.png",
naiveFindImageLink(text)
findFirstImageLinkInHtml(text, null)
)
}

@Test
fun returnsNullWhenNoImageLink() {
val text = "<summary type=\"html\">&lt;img title=\"Every time we detect an asteroid from outside the Solar System, we should immediately launch a mission to fling one of our asteroids back in the direction it came from.\" alt=\"Every time we detect an asteroid from outside the Solar System, we should immediately launch a mission to fling one of our asteroids back in the direction it came from.\" /&gt;</summary>"
assertNull(naiveFindImageLink(text))
assertNull(findFirstImageLinkInHtml(text, null))
}

@Test
fun returnsNullForNull() {
assertNull(naiveFindImageLink(null))
assertNull(findFirstImageLinkInHtml(null, null))
}

@Test
fun returnsNullFoTwitterIcons() {
assertNull(naiveFindImageLink("<description>There's a tiny, black-freckled toad that likes the water in hot springs. Unfortunately, the only place in the world where the species is found is on 760 acres of wetlands about 100 miles east of Reno, Nevada, according to the New York Times. And that's near the site for two renewable-energy geothermal plants which poses \"significant risk to the well-being of the species,\" according to America's Fish and Wildlife Service &mdash; which just announced an emergency measure declaring it an endangered species. The temporary protection, which went into effect immediately and lasts for 240 days, was imposed to ward off the toad's potential extinction, the U.S. Fish and Wildlife Service said in a statement, adding that it would consider public comments about whether to extend the toad's emergency listing. The designation would add another hurdle for a plan to build two power plants with the encouragement of the U.S. Bureau of Land Management. The project is already the subject of a lawsuit filed by conservationists and a nearby Native American tribe. They hope the emergency listing can be used to block construction, which recently resumed.... The suit contended that the geothermal plants would dry up nearby hot springs sacred to the tribe and wipe out the Dixie Valley toad species. The U.S. Fish and Wildlife Service argues that \"protecting small population species like this ensures the continued biodiversity necessary to maintain climate-resilient landscapes in one of the driest states in the country.\" They were only recently scientifically described &mdash; or declared a unique species &mdash; in 2017, making the Dixie Valley toad \"&gt;the first new toad species to be described in the U.S. in nearly 50 years. And they are truly unique. When they were described, scientists analyzed 14 different morphological characteristics like size, shape, and markings. Dixie Valley toads scored \"significantly different\" from other western toad species in all categories. Thanks to long-time Slashdot reader walterbyrd for sharing the link!<p><div class=\"share_submission\" style=\"position:relative;\"> <a class=\"slashpop\" href=\"http://twitter.com/home?status=How+A+Tiny+Toad+Could+Upend+a+US+Geothermal+Project%3A+https%3A%2F%2Fbit.ly%2F3U26QW8\"><img src=\"https://a.fsdn.com/sd/twitter_icon_large.png\"></a> <a class=\"slashpop\" href=\"http://www.facebook.com/sharer.php?u=https%3A%2F%2Fhardware.slashdot.org%2Fstory%2F22%2F09%2F11%2F1931213%2Fhow-a-tiny-toad-could-upend-a-us-geothermal-project%3Futm_source%3Dslashdot%26utm_medium%3Dfacebook\"><img src=\"https://a.fsdn.com/sd/facebook_icon_large.png\"></a> </div></p><p><a href=\"https://hardware.slashdot.org/story/22/09/11/1931213/how-a-tiny-toad-could-upend-a-us-geothermal-project?utm_source=rss1.0moreanon&amp;utm_medium=feed\">Read more of this story</a> at Slashdot.</p><iframe src=\"https://slashdot.org/slashdot-it.pl?op=discuss&amp;id=22035223&amp;smallembed=1\" style=\"height: 300px; width: 100%; border: none;\"></iframe></description>"))
fun returnsNullForTwitterAndFacebookIcons() {
assertNull(findFirstImageLinkInHtml("<description>There's a tiny, black-freckled toad that likes the water in hot springs. Unfortunately, the only place in the world where the species is found is on 760 acres of wetlands about 100 miles east of Reno, Nevada, according to the New York Times. And that's near the site for two renewable-energy geothermal plants which poses \"significant risk to the well-being of the species,\" according to America's Fish and Wildlife Service &mdash; which just announced an emergency measure declaring it an endangered species. The temporary protection, which went into effect immediately and lasts for 240 days, was imposed to ward off the toad's potential extinction, the U.S. Fish and Wildlife Service said in a statement, adding that it would consider public comments about whether to extend the toad's emergency listing. The designation would add another hurdle for a plan to build two power plants with the encouragement of the U.S. Bureau of Land Management. The project is already the subject of a lawsuit filed by conservationists and a nearby Native American tribe. They hope the emergency listing can be used to block construction, which recently resumed.... The suit contended that the geothermal plants would dry up nearby hot springs sacred to the tribe and wipe out the Dixie Valley toad species. The U.S. Fish and Wildlife Service argues that \"protecting small population species like this ensures the continued biodiversity necessary to maintain climate-resilient landscapes in one of the driest states in the country.\" They were only recently scientifically described &mdash; or declared a unique species &mdash; in 2017, making the Dixie Valley toad \"&gt;the first new toad species to be described in the U.S. in nearly 50 years. And they are truly unique. When they were described, scientists analyzed 14 different morphological characteristics like size, shape, and markings. Dixie Valley toads scored \"significantly different\" from other western toad species in all categories. Thanks to long-time Slashdot reader walterbyrd for sharing the link!<p><div class=\"share_submission\" style=\"position:relative;\"> <a class=\"slashpop\" href=\"http://twitter.com/home?status=How+A+Tiny+Toad+Could+Upend+a+US+Geothermal+Project%3A+https%3A%2F%2Fbit.ly%2F3U26QW8\"><img src=\"https://a.fsdn.com/sd/twitter_icon_large.png\"></a> <a class=\"slashpop\" href=\"http://www.facebook.com/sharer.php?u=https%3A%2F%2Fhardware.slashdot.org%2Fstory%2F22%2F09%2F11%2F1931213%2Fhow-a-tiny-toad-could-upend-a-us-geothermal-project%3Futm_source%3Dslashdot%26utm_medium%3Dfacebook\"><img src=\"https://a.fsdn.com/sd/facebook_icon_large.png\"></a> </div></p><p><a href=\"https://hardware.slashdot.org/story/22/09/11/1931213/how-a-tiny-toad-could-upend-a-us-geothermal-project?utm_source=rss1.0moreanon&amp;utm_medium=feed\">Read more of this story</a> at Slashdot.</p><iframe src=\"https://slashdot.org/slashdot-it.pl?op=discuss&amp;id=22035223&amp;smallembed=1\" style=\"height: 300px; width: 100%; border: none;\"></iframe></description>", null))
}

@Test
fun correctlyMatchesKindOfQuote() {
assertEquals(
"""a'quote""",
naiveFindImageLink("""<img src="a'quote">""")
)

assertEquals(
"""a"quote""",
naiveFindImageLink("""<img src='a"quote'>""")
)
fun returnsNullForTrackingPixels() {
assertNull(findFirstImageLinkInHtml("""<description>asdfasd <img src="https://foo/track.png" width="1" height="1"> asdf asd</description>""", null))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,24 @@ class RomeExtensionsKtTest {
)
}

@Test
fun thumbnailFromEnclosureIsFound() {
val item = mockSyndEntry(
uri = "id",
enclosures = listOf(
mockSyndEnclosure(
url = "http://foo/bar.png",
type = "image/png",
)
),
).asItem(baseUrl)

assertEquals(
"http://foo/bar.png",
item.image,
)
}

@Test
fun publishedRFC3339Date() {
// Need to convert it so timezone is correct for test
Expand Down Expand Up @@ -617,6 +635,7 @@ class RomeExtensionsKtTest {
mediaContents: Array<MediaContent>? = null,
title: String? = null,
titleEx: SyndContent? = null,
enclosures: List<SyndEnclosure> = emptyList(),
): SyndEntry {
val mock = mock(SyndEntry::class.java)

Expand All @@ -632,6 +651,7 @@ class RomeExtensionsKtTest {
`when`(mock.contents).thenReturn(contents)
`when`(mock.publishedDate).thenReturn(publishedDate)
`when`(mock.updatedDate).thenReturn(updatedDate)
`when`(mock.enclosures).thenReturn(enclosures)

val mockMedia = mock(MediaEntryModule::class.java)
val mockMetadata = mock(com.rometools.modules.mediarss.types.Metadata::class.java)
Expand Down

0 comments on commit 7b2c28e

Please sign in to comment.