feat(refresh): implement If-Modified-Since detection to refresh only newer content

Also autoformatted with PHP Tools on VS Code
2026-06-18 17:55:14 +00:00 · 2024-12-02 22:25:36 -06:00 · 2024-12-02 22:25:36 -06:00 · d583632c48
commit d583632c48
parent ee7e6558b2
1 changed files with 106 additions and 49 deletions
--- a/libs/twtxt.php
+++ b/libs/twtxt.php
@ -1,4 +1,5 @@
 <?php
+
 declare(strict_types=1);

 $config = parse_ini_file('private/config.ini');
@ -9,7 +10,8 @@ if ($config['debug_mode']) {
 	error_reporting(E_ALL);
 }

-class TwtxtFile {
+class TwtxtFile
+{
 	public $mainURL = ''; // First found URL
 	public $URLs = [];
 	public $nick = '';
@ -22,7 +24,8 @@ class TwtxtFile {
 	public $twts = [];
 }

-class Twt {
+class Twt
+{
 	public $originalTwtStr;
 	public $hash;
 	public $timestamp;
@ -64,13 +67,14 @@ curl_setopt($curl, CURLOPT_SSLVERSION, 4);
 * found, the function returns the value of the key as a string after trimming any whitespace. If no
 * match is found, the function returns null.
 */
-function getSingleParameter($keyToFind, $string) {
+function getSingleParameter($keyToFind, $string)
+{
 	if (!str_contains($string, $keyToFind)) {
 		return null;
 	}

 	$pattern = '/\s*(?<!\S)' . $keyToFind . '\s*=\s*([^#\n]+)/';
-		// Fix: not machting with nick as in: `# follow = dbucklin@www.davebucklin.com https://www.davebucklin.com/twtxt.txt?nick=dbucklin`
+	// Fix: avoid matching the key inside a URL, e.g. the `nick=` in: `# follow = dbucklin@www.davebucklin.com https://www.davebucklin.com/twtxt.txt?nick=dbucklin`
 	//$pattern = '/\s*' . $keyToFind . '\s*=\s*([^#\n]+)/';
 	//$pattern = '/\s*' . $keyToFind . '\s*=\s*([^\s#]+)/'; // Only matches the first word
 	preg_match($pattern, $string, $matches);
@ -82,7 +86,8 @@ function getSingleParameter($keyToFind, $string) {
 	return null;
 }

-function getDoubleParameter($keywordToFind, $string) {
+function getDoubleParameter($keywordToFind, $string)
+{
 	// Returns string or null
 	$pattern = '/#\s*' . preg_quote($keywordToFind, '/') . '\s*=\s*(\S+)\s*(\S+)/';
 	// Matches "# <keyword> = <value> <value>"
@ -96,7 +101,8 @@ function getDoubleParameter($keywordToFind, $string) {
 	return null;
 }

-function getReplyHashFromTwt(string $twtString): string {
+function getReplyHashFromTwt(string $twtString): string
+{
 	// Extract the text between parentheses using regular expressions
 	$pattern = '/\(#([^\)]+)\)/'; // Matches "(#<text>)"
 	preg_match($pattern, $twtString, $matches);
@ -109,7 +115,8 @@ function getReplyHashFromTwt(string $twtString): string {
 	return '';
 }

-function getImagesFromTwt(string $twtString) {
+function getImagesFromTwt(string $twtString)
+{
 	$pattern = '/(<img[^>]+>)/i';
 	preg_match_all($pattern, $twtString, $matches, PREG_SET_ORDER);

@ -122,7 +129,8 @@ function getImagesFromTwt(string $twtString) {
 	return $result;
 }

-function getTagsFromTwt(string $twtString) {
+function getTagsFromTwt(string $twtString)
+{
 	//$pattern = '/(?<!\()\B#\w+(?!\))/iu';
 	//$pattern = '/(?<=\B)#(\w+)/';
 	$pattern = '/(?<=\B)#([\p{L}\p{N}_]+)/u';
@ -144,7 +152,8 @@ function getTagsFromTwt(string $twtString) {
 	return $result;
 }

-function getMentionsFromTwt(string $twtString) {
+function getMentionsFromTwt(string $twtString)
+{
 	$pattern = '/@<([^>]+)\s([^>]+)>/'; // Matches "@<nick url>"
 	preg_match_all($pattern, $twtString, $matches, PREG_SET_ORDER);

@ -159,13 +168,14 @@ function getMentionsFromTwt(string $twtString) {
 	return $result;
 }

-function replaceMentionsFromTwt(string $twtString): string {
+function replaceMentionsFromTwt(string $twtString): string
+{
 	// Example input: 'Hello @<eapl.mx https://eapl.mx/twtxt.txt>, how are you? @<nick https://server.com/something/twtxt.txt>';
 	// Example output: Hello <a href="?url=https://eapl.mx/twtxt.txt">@eapl.mx@eapl.mx/twtxt.txt</a>, how are you? <a href="?url=https://server.com/something/twtxt.txt">@nick@server.com/something/twtxt.txt</a>

 	$pattern = '/@<([^ ]+)\s([^>]+)>/';
 	//$replacement = '<a href="/?url=$2">@$1</a>';
-	$replacement = '<a href="'.str_replace("/index.php", "", $_SERVER["SCRIPT_NAME"]).'/profile?url=$2">@$1</a>';
+	$replacement = '<a href="' . str_replace("/index.php", "", $_SERVER["SCRIPT_NAME"]) . '/profile?url=$2">@$1</a>';
 	$replacement .= '<a href="$2" class="webmention"></a>'; // Adds a hidden link direcly to the twtxt.txt of the mentioned target
 	#$twtString = '@<nick https://eapl.mx/twtxt.txt>';
 	#$pattern = '/@<([^ ]+) ([^>]+)>/';
@ -175,10 +185,11 @@ function replaceMentionsFromTwt(string $twtString): string {

 	// from https://github.com/hxii/picoblog/blob/master/picoblog.php
 	//$pattern = '/\@<([a-zA-Z0-9\.]+)\W+(https?:\/\/[^>]+)>/';
-    //return preg_replace($pattern,'<a href="$2">@$1</a>',$twtString);
+	//return preg_replace($pattern,'<a href="$2">@$1</a>',$twtString);
 }

-function replaceLinksFromTwt(string $twtString) {
+function replaceLinksFromTwt(string $twtString)
+{

 	// TODO: Make this NOT match with `inline code` to avoid links in code-snippets
 	// 1. Look into how yarnd handles this
@ -194,7 +205,8 @@ function replaceLinksFromTwt(string $twtString) {
 	return $result;
 }

-function replaceMarkdownLinksFromTwt(string $twtString) {
+function replaceMarkdownLinksFromTwt(string $twtString)
+{
 	$pattern = '/\[([^\]]+)\]\(([^)]+)\)/';

 	$replacement = '<a href="$2">$1</a>';
@ -203,7 +215,8 @@ function replaceMarkdownLinksFromTwt(string $twtString) {
 	return $result;
 }

-function replaceImagesFromTwt(string $twtString) {
+function replaceImagesFromTwt(string $twtString)
+{
 	$pattern = '/!\[(.*?)\]\((.*?)\)/';
 	//$replacement = '<img src="$2" alt="$1">';
 	$replacement = '<a href="$2"><img src="$2" alt="$1"></a>';
@ -212,11 +225,12 @@ function replaceImagesFromTwt(string $twtString) {
 	return $result;
 }

-function replaceTagsFromTwt(string $twtString) {
+function replaceTagsFromTwt(string $twtString)
+{
 	//$pattern = '/#(\w+)?/';
 	//$pattern = '/(?<=\s)#(\w+)/';
 	$pattern = '/(?<=\B)#([\p{L}\p{N}_]+)/u';
-	
+
 	//$replacement = '<a href="#">#\1</a>'; // Dummy link
 	$replacement = '<a href="?search=$1" class="tag">#${1}</a>';
 	$result = preg_replace($pattern, $replacement, $twtString);
@ -224,17 +238,18 @@ function replaceTagsFromTwt(string $twtString) {
 	return $result;
 }

-function embedYoutubeFromTwt(string $twtString) {
+function embedYoutubeFromTwt(string $twtString)
+{

-    // original regex source: https://gist.github.com/afeld/1254889#gistcomment-1253992
+	// original regex source: https://gist.github.com/afeld/1254889#gistcomment-1253992
 	$pattern = '/(?:youtube(?:-nocookie)?\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})/mi';

-	if(preg_match_all($pattern, $twtString, $youtubeLinks)) {
-		
+	if (preg_match_all($pattern, $twtString, $youtubeLinks)) {
+
 		$youtubeLinks = array_unique($youtubeLinks[1]); // Remove dublicate cause by raw URLs conceverter to links

 		foreach ($youtubeLinks as $videoID) {
-			$twtString .= '<iframe loading="lazy" src="https://www.youtube.com/embed/'.$videoID.'" class="embed-video" allow="encrypted-media" title="" allowfullscreen="allowfullscreen" frameborder="0"></iframe>';
+			$twtString .= '<iframe loading="lazy" src="https://www.youtube.com/embed/' . $videoID . '" class="embed-video" allow="encrypted-media" title="" allowfullscreen="allowfullscreen" frameborder="0"></iframe>';
 		}
 	}

@ -244,7 +259,8 @@ function embedYoutubeFromTwt(string $twtString) {
 }


-function getTimeElapsedString($timestamp, $full = false) {
+function getTimeElapsedString($timestamp, $full = false)
+{
 	$now = new DateTime;
 	$ago = new DateTime;
 	$ago->setTimestamp($timestamp);
@ -290,7 +306,8 @@ function getTimeElapsedString($timestamp, $full = false) {
 	return $string ? implode(', ', $string) . " $agoText" : 'just now';
 }

-function getCachedFileContentsOrUpdate($fileURL, $cacheDurationSecs = 15) {
+function getCachedFileContentsOrUpdate($fileURL, $cacheDurationSecs = 15)
+{
 	# TODO: Process the Warning
 	# Warning: file_get_contents(https://eapl.mx/twtxt.net):
 	# failed to open stream: HTTP request failed! HTTP/1.1 404 Not Found in
@ -309,7 +326,8 @@ function getCachedFileContentsOrUpdate($fileURL, $cacheDurationSecs = 15) {
 	return $contents;
 }

-function getCachedFileContents($filePath) {
+function getCachedFileContents($filePath)
+{
 	$cacheFile = getCachedFileName($filePath);

 	// Check if cache file exists and it's not expired
@ -320,31 +338,65 @@ function getCachedFileContents($filePath) {
 	return null;
 }

-function updateCachedFile($filePath, $cacheDurationSecs = 15) {
+function updateCachedFile($filePath)
+{
 	$cacheFilePath = getCachedFileName($filePath);
+	# TODO: Report down URLs and stop loading them after a few tries

-	// File doesn't exist in cache or has expired, so fetch and cache it
-	// TODO: Seems it's not working right!
-	$fileDoesntExist = !file_exists($cacheFilePath);
-	$fileIsOld = false;
-	if (!$fileDoesntExist) {
-		$fileIsOld = !((time() - filemtime($cacheFilePath)) < $cacheDurationSecs);
+	# Get the last modification time of the local file
+	$lastModifiedTime = file_exists($cacheFilePath) ? filemtime($cacheFilePath) : false;
+	$lastModifiedHeader = $lastModifiedTime ? gmdate('D, d M Y H:i:s', $lastModifiedTime) . ' GMT' : null;
+
+	# echo "lastModifiedHeader: $lastModifiedHeader<br>\n";
+
+	# Set up the HTTP context with the 'If-Modified-Since' header
+	$options = [
+		'http' => [
+			'method' => 'GET',
+			'header' => $lastModifiedHeader ? "If-Modified-Since: $lastModifiedHeader\r\n" : '',
+		]
+	];
+
+	$context = stream_context_create($options);
+
+	$response = @file_get_contents($filePath, false, $context);
+
+	# $http_response_header is only set once an HTTP response has been received; if it is missing, the server was probably unreachable
+	if (!isset($http_response_header)) {
+		# echo "Failed to fetch headers. No HTTP request was made.\n";
+		return;
 	}

-	if ($fileDoesntExist || $fileIsOld) {
-		#echo "Loading Cached file $cacheFilePath<br>\n";
-		$contents = @file_get_contents($filePath);
+	if ($http_response_header) {
+		# var_dump($http_response_header);

-		if ($contents === false) {
-			// File loaded with errors, skip saving it
-			return;
+		foreach ($http_response_header as $header) {
+			# Look for the Last-Modified header
+			if (preg_match('/^Last-Modified:\s*(.+)$/i', $header, $matches)) {
+				$dateString = $matches[1]; // Extracted date
+				# echo "Extracted Date: $dateString\n";
+
+				// Convert to Unix timestamp
+				$lastModifiedTimestamp = strtotime($dateString);
+				if ($lastModifiedTimestamp > $lastModifiedTime) {
+					# echo "Remote file is newer. Load it!<br>\n";
+				} else {
+					# echo "Not modified since last request. No update needed.<br>\n";
+					return;
+				}
+			}
 		}
+	}

-		file_put_contents($cacheFilePath, $contents);
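+	# Save the content if it was successfully retrieved. NOTE(review): a "304 Not Modified" reply that carries no Last-Modified header appears to reach this point with an empty body and would overwrite the cache; confirm and check the status line first.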
+	if ($response !== false) {
+		file_put_contents($cacheFilePath, $response);
+		#echo "File updated successfully.\n";
 	}
 }

-function getTwtsFromTwtxtString($url) {
+function getTwtsFromTwtxtString($url)
+{
 	$fileContent = getCachedFileContents($url);

 	if (is_null($fileContent)) {
@ -398,8 +450,8 @@ function getTwtsFromTwtxtString($url) {
 		// mosty for (re)feeds from Mastodon etc.
 		if (str_contains($twtxtData->nick, "@")) {
 			$str = $twtxtData->nick;
-			$str = ltrim($str,"@");
-			$twtxtData->nick = explode("@",$str)[0]; // take the first [0] from splitting the nick at "@"
+			$str = ltrim($str, "@");
+			$twtxtData->nick = explode("@", $str)[0]; // take the first [0] from splitting the nick at "@"
 		}

 		// Fallback for nick and url if not set in twtxt.txt
@ -436,7 +488,7 @@ function getTwtsFromTwtxtString($url) {
 				//$twtContent = str_replace("\u{2028}", "\n<br>\n", $twtContent);
 				$twtContent = str_replace("\u{2028}", "\n", $twtContent);

-				$twtContent = embedYoutubeFromTwt($twtContent); 
+				$twtContent = embedYoutubeFromTwt($twtContent);

 				// Get and remove the hash
 				$hash = getReplyHashFromTwt($twtContent);
@ -454,7 +506,7 @@ function getTwtsFromTwtxtString($url) {
 				//$twtContent = replaceLinksFromTwt($twtContent);

 				// TODO: Make ?tag= filtering feature
-				$twtContent = replaceTagsFromTwt($twtContent); 
+				$twtContent = replaceTagsFromTwt($twtContent);

 				// TODO: Get mentions
 				$mentions = getMentionsFromTwt($twtContent);
@ -493,7 +545,8 @@ function getTwtsFromTwtxtString($url) {
 	return $twtxtData;
 }

-function insertFollowingURL($urlString) {
+function insertFollowingURL($urlString)
+{
 	// Check if it's a valid URL
 	// Retrieve the nickname, if didn't find a nick, ask for one

@ -509,22 +562,26 @@ function insertFollowingURL($urlString) {
 	echo $result;
 }

-function getCachedFileName($filePath) {
+function getCachedFileName($filePath)
+{
 	return __DIR__ . '/../private/cache/' . hash('sha256', $filePath); // TODO: make better path
 }

 if (!function_exists('str_starts_with')) {
-	function str_starts_with($haystack, $needle) {
+	function str_starts_with($haystack, $needle)
+	{
 		return (string)$needle !== '' && strncmp($haystack, $needle, strlen($needle)) === 0;
 	}
 }
 if (!function_exists('str_ends_with')) {
-	function str_ends_with($haystack, $needle) {
+	function str_ends_with($haystack, $needle)
+	{
 		return $needle !== '' && substr($haystack, -strlen($needle)) === (string)$needle;
 	}
 }
 if (!function_exists('str_contains')) {
-	function str_contains($haystack, $needle) {
+	function str_contains($haystack, $needle)
+	{
 		return $needle !== '' && mb_strpos($haystack, $needle) !== false;
 	}
 }