From df2cc131676f4ba1e6c92d85c21910293341ce0e Mon Sep 17 00:00:00 2001 From: Adam Piontek Date: Thu, 1 Apr 2021 18:09:52 -0400 Subject: [PATCH] post parsing now builds a 'corpus' for each post, useful for future search feature, and ... seo keywords? --- .iex.exs | 2 + lib/home73k/blog.ex | 7 +- lib/home73k/blog/post.ex | 143 ++++++++++++++++++++++----------------- mix.lock | 2 + 4 files changed, 88 insertions(+), 66 deletions(-) create mode 100644 .iex.exs diff --git a/.iex.exs b/.iex.exs new file mode 100644 index 0000000..701cbda --- /dev/null +++ b/.iex.exs @@ -0,0 +1,2 @@ +alias Home73k.Blog +alias Home73k.Blog.Post diff --git a/lib/home73k/blog.ex b/lib/home73k/blog.ex index c3e79ab..8092e2a 100644 --- a/lib/home73k/blog.ex +++ b/lib/home73k/blog.ex @@ -17,13 +17,12 @@ defmodule Home73k.Blog do Post.parse!(post_path) end - @posts posts - # @posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime}) + @posts Enum.sort_by(posts, & &1.date, {:desc, NaiveDateTime}) - # @tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort() + @tags posts |> Stream.flat_map(& &1.tags) |> Stream.uniq() |> Enum.sort() def list_posts, do: @posts - # def list_tags, do: @tags + def list_tags, do: @tags # defmodule NotFoundError do # defexception [:message, plug_status: 404] diff --git a/lib/home73k/blog/post.ex b/lib/home73k/blog/post.ex index 2f347b5..dd39f24 100644 --- a/lib/home73k/blog/post.ex +++ b/lib/home73k/blog/post.ex @@ -1,8 +1,8 @@ defmodule Home73k.Blog.Post do - @enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body] - defstruct [:title, :slug, :date, :author, :tags, :lede, :body] + @enforce_keys [:title, :slug, :date, :author, :tags, :lede, :body, :corpus] + defstruct [:title, :slug, :date, :author, :tags, :lede, :body, :corpus] - @title_slug_regex ~r/[^a-zA-Z0-9 ]/ + @strip_words ~w(the and are for not but had has was all any too one you his her can that with have this will your from they want been much some very them into which then now 
get its youll youre) @doc """ The public parse!/1 function begins the post parse process by reading @@ -15,13 +15,19 @@ defmodule Home73k.Blog.Post do |> split_raw_file_data() |> parse_frontmatter() |> parse_lede() + |> parse_body() + |> build_corpus() + |> build_post() end # """ split_raw_file_data/1 # If we receive {:ok, file_data}, we split frontmatter from markdown # content and return [raw_frontmatter, markdown]. Otherwise return nil. # """ - defp split_raw_file_data({:ok, file_data}), do: String.split(file_data, ~r/\n-{3,}\n/, parts: 2) + defp split_raw_file_data({:ok, file_data}) do + file_data |> String.split("---", parts: 2, trim: true) + end + defp split_raw_file_data(_), do: nil # """ parse_frontmatter/1 @@ -30,7 +36,7 @@ defmodule Home73k.Blog.Post do # """ defp parse_frontmatter([fm, md]) do case parse_frontmatter_string(fm) do - {%{} = parsed_fm, _} -> {parsed_fm, md} + {%{} = parsed_fm, _} -> {set_post_slug(parsed_fm), String.trim(md)} {:error, _} -> nil end end @@ -47,11 +53,50 @@ defmodule Home73k.Blog.Post do defp parse_lede(_), do: nil - # TODO: - # |> parse_body() - # - convert to markdown - # - extract any code parts to mark with pygments? - # - figure that whole thing out + # """ parse_body/1 + # Convert body markdown to html + # TODO: handle syntax highlighting + defp parse_body({fm, md}) do + Map.put(fm, :body, Earmark.as_html!(md)) + end + + defp parse_body(_), do: nil + + # """ build_corpus/1 + # Create a searchable word list for the post, for live searching + defp build_corpus(%{title: title, lede: lede, body: body, tags: tags} = post_data) do + # initialize corpus string from: title, lede, body, tags + corpus = (tags ++ [title, (lede && lede) || " ", body]) |> Enum.join(" ") |> String.downcase() + + # scrub out (but replace with spaces): + # code blocks, html tags, html entities, newlines, forward and back slashes + html_scrub_regex = ~r/(
<\/pre>)|(<(.|\n)+?>)|(&#(.)+?;)|(&(.)+?;)|\n|\/|\\/
+    corpus = Regex.replace(html_scrub_regex, corpus, " ")
+
+    # restrict corpus to letters & numbers,
+    # then split to words (space delim), trimming as we go
+    # then reject all 0, 1, 2-letter words, and words in @strip_words
+    # reduce to unique words and join back to space-delim string
+    corpus =
+      Regex.replace(~r/[^a-z0-9 ]/, corpus, "")
+      |> String.split(" ", trim: true)
+      |> Stream.reject(&reject_word?/1)
+      |> Stream.uniq()
+      |> Enum.join(" ")
+
+    # Finally, return post_data with corpus
+    Map.put(post_data, :corpus, corpus)
+  end
+
+  defp build_corpus(_), do: nil
+
+  # """ build_post/1
+  # Turn the post data map into a struct via struct!/2; any non-map input yields nil
+  defp build_post(post_data) when is_map(post_data) do
+    struct!(__MODULE__, post_data)
+  end
+
+  defp build_post(_), do: nil
 
   ######################################################################
   # HELPERS
@@ -74,62 +119,36 @@ defmodule Home73k.Blog.Post do
   # Handle split of post body. If lede found, return as html with body.
   # Otherwise return nil with body.
   # """
-  defp extract_lede([lede, body]), do: {Earmark.as_html!(lede), body}
+  defp extract_lede([lede, body]),
+    do: {lede |> String.trim_trailing() |> Earmark.as_html!(), String.trim_leading(body)}
+
   defp extract_lede([body]), do: {nil, body}
 
-  # ##################################################
-  # ##################################################
-  # ##################################################
-  # ##################################################
-  # ##################################################
-  # defp parse_split_file_data(["", fm, md]) do
-  #   Code.eval_string(fm)
-  #   |> parse_lede(md)
-  # end
+  # """ set_post_slug
+  # If frontmatter already has a :slug key, keep it; otherwise derive one from :title and add it
+  # """
+  defp set_post_slug(%{slug: _} = fm), do: fm
 
-  # defp parse_split_file_data(_), do: nil
+  defp set_post_slug(%{title: title} = fm) do
+    Map.put(fm, :slug, parse_title_to_slug(title))
+  end
 
-  # defp parse_lede({%{summary: summ} = fm, _}, md) do
-  #   Earmark.as_html(md)
-  #   |> parse_post(Earmark.as_html(summ), fm)
-  # end
+  # """ parse_title_to_slug
+  # Builds a URI path slug: downcase, keep only [a-z0-9 ], drop reject_word?/1 words, join with "-"
+  # """
+  defp parse_title_to_slug(title) do
+    title = String.downcase(title)
 
-  # defp parse_lede({%{} = fm, _}, md) do
-  #   String.split(md, "", parts: 2)
-  #   |> parse_lede(fm)
-  # end
+    Regex.replace(~r/[^a-z0-9 ]/, title, "")
+    |> String.split(" ", trim: true)
+    |> Stream.reject(&reject_word?/1)
+    |> Enum.join("-")
+  end
 
-  # defp parse_lede([summ, _] = parts, fm) do
-  #   parts
-  #   |> Enum.join(" ")
-  #   |> Earmark.as_html()
-  #   |> parse_post(Earmark.as_html(summ), fm)
-  # end
-
-  # defp parse_lede(md, fm) do
-  #   Earmark.as_html(md)
-  #   |> parse_post({:ok, nil, []}, fm)
-  # end
-
-  # defp parse_title_to_slug(title) do
-  #   Regex.replace(@title_slug_regex, title, "")
-  #   |> String.replace(" ", "-")
-  #   |> String.downcase()
-  # end
-
-  # defp build_post(main_html, summ_html, fm) do
-  #   fm
-  #   |> Map.put_new(:slug, parse_title_to_slug(fm.title))
-  #   |> Map.put_new(:author, "Author Name")
-  #   |> Map.put_new(:tags, [])
-  #   |> Map.put(:summary, summ_html)
-  #   |> Map.put(:body, main_html)
-  # end
-
-  # defp parse_post({:ok, main_html, _}, {:ok, summ_html, _}, fm) do
-  #   post = build_post(main_html, summ_html, fm)
-  #   struct!(__MODULE__, post)
-  # end
-
-  # defp parse_post(_, _, _), do: nil
+  # """ reject_word?
+  # True when a word should be dropped: it is shorter than 3 characters
+  # or it appears in @strip_words
+  # Used by parse_title_to_slug and build_corpus
+  # """
+  defp reject_word?(word), do: word in @strip_words or String.length(word) < 3
 end
diff --git a/mix.lock b/mix.lock
index df85b5d..0e4318f 100644
--- a/mix.lock
+++ b/mix.lock
@@ -13,6 +13,7 @@
   "git_cli": {:hex, :git_cli, "0.3.0", "a5422f9b95c99483385b976f5d43f7e8233283a47cda13533d7c16131cb14df5", [:mix], [], "hexpm", "78cb952f4c86a41f4d3511f1d3ecb28edb268e3a7df278de2faa1bd4672eaf9b"},
   "hackney": {:hex, :hackney, "1.17.4", "99da4674592504d3fb0cfef0db84c3ba02b4508bae2dff8c0108baa0d6e0977c", [:rebar3], [{:certifi, "~>2.6.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "de16ff4996556c8548d512f4dbe22dd58a587bf3332e7fd362430a7ef3986b16"},
   "html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
+  "html_sanitize_ex": {:hex, :html_sanitize_ex, "1.4.1", "e8a67da405fe9f0d1be121a40a60f70811192033a5b8d00a95dddd807f5e053e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "68d92656f47cd73598c45ad2394561f025c8c65d146001b955fd7b517858962a"},
   "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
   "jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"},
   "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
@@ -20,6 +21,7 @@
   "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
   "mime": {:hex, :mime, "1.5.0", "203ef35ef3389aae6d361918bf3f952fa17a09e8e43b5aa592b93eba05d0fb8d", [:mix], [], "hexpm", "55a94c0f552249fc1a3dd9cd2d3ab9de9d3c89b559c2bd01121f824834f24746"},
   "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
+  "mochiweb": {:hex, :mochiweb, "2.20.1", "e4dbd0ed716f076366ecf62ada5755a844e1d95c781e8c77df1d4114be868cdf", [], [], "hexpm", "d1aeee7870470d2fa9eae0b3d5ab6c33801aa2d82b10e9dade885c5c921b36aa"},
   "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
   "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"},
   "phoenix": {:hex, :phoenix, "1.5.8", "71cfa7a9bb9a37af4df98939790642f210e35f696b935ca6d9d9c55a884621a4", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "35ded0a32f4836168c7ab6c33b88822eccd201bcd9492125a9bea4c54332d955"},