Formalizing Regex Derivative in Lean
Brzozowski's derivative is truly a gem in the literature and surprised me with its beauty when I first learned about it towards the end of 2023, thanks to the reference by Guannan Wei. Derivatives of a language $r$ with respect to a string $\pi$, in a nutshell, is a new language containing all strings from the input language when appended to π. One propertty of derivatives that initially challenged my understanding is $d_{\pi} r^{\ast} = d_{\pi}r \cdot r^{\ast}$. Through this post, I aim to sharpen my skills in proof mechanization and learn some Lean by formalizing my reasoning about this intriguing property.
Mathlib Import
import Mathlib.Data.Prod.Lex
import Mathlib.Order.BooleanAlgebra
import Mathlib.Tactic.Linarith
Some Auxiliary Definitions
class Denotation (α : Type) (σ : outParam (Type)) where
denote : α -> Set σ
class EffectiveBooleanAlgebra (α : Type) (σ : outParam (Type))
extends Denotation α σ, Bot α, Top α, HasCompl α, Inf α, Sup α where
denote_bot : denote ⊥ = ∅
denote_top : denote ⊤ = univ
denote_compl : denote (compl a) = univ \ denote a
denote_inf : denote (a ⊓ b) = denote a ∩ denote b
denote_sup : denote (a ⊔ b) = denote a ∪ denote b
Regex Definition
variable (α : Type) in
inductive Regex where
| ε : Regex
| atom : α -> Regex
| concat : Regex -> Regex -> Regex
| star : Regex -> Regex
| neg : Regex -> Regex
| inter : Regex -> Regex -> Regex
| union : Regex -> Regex -> Regex
open Regex
infixr:50 " ⬝ " => concat
postfix:max "*" => star
prefix:max "~" => neg
infixr:40 " ⋒ " => inter
infixr:35 " ⋓ " => union
Regex Denotation
@[simp]
def star_metric : Regex α -> (ℕ ×ₗ ℕ)
| ε => (0, 1)
| atom _ => (0, 1)
| concat r1 r2 | inter r1 r2 | union r1 r2 =>
let (h1, n1) := star_metric r1
let (h2, n2) := star_metric r2
(max h1 h2, n1 + n2)
| star r => let (h, n) := star_metric r; (h + 1, n + 1)
| neg r => let (h, n) := star_metric r; (h, n + 1)
instance : WellFoundedRelation (ℕ ×ₗ ℕ) where
rel := (· < ·)
wf := WellFounded.prod_lex WellFoundedRelation.wf WellFoundedRelation.wf
theorem zero_lt_size (r : Regex α) : 0 < (star_metric r).snd :=
by induction r using Regex.rec with
| ε | atom _ => simp
| concat r1 r2 ih1 ih2 | inter r1 r2 ih1 ih2 | union r1 r2 ih1 ih2 =>
simp; split; rename_i h1 n1 heq1; split; rename_i h2 n2 heq2; simp;
rewrite [heq1] at ih1; rewrite [heq2] at ih2; simp at ih1 ih2;
apply Or.intro_left; assumption
| star r | neg r => simp; split; simp
def repeat_cat (R : Regex α) (n : ℕ) : Regex α :=
match n with
| 0 => ε
| Nat.succ n => R ⬝ (repeat_cat R n)
notation f "⁽" n "⁾" => repeat_cat f n
namespace Regex
variable {α σ : Type} [EffectiveBooleanAlgebra α σ]
def denote (r : Regex α) : Set (List σ) :=
match r with
| Regex.ε => ∅
| Regex.atom ℓ => {[a] | a ∈ Denotation.denote ℓ}
| Regex.concat r1 r2 => (denote r1) ∪ (denote r2)
| Regex.star r => {π | ∃ i : ℕ, π ∈ denote (repeat_cat r i)}
| Regex.neg r => Set.univ \ denote r
| Regex.inter r1 r2 => denote r1 ∩ denote r2
| Regex.union r1 r2 => denote r1 ∪ denote r2
termination_by star_metric r
decreasing_by
all_goals (
simp only [InvImage, WellFoundedRelation.rel, star_metric]
try (unfold LT.lt Prod.Lex.instLT max Nat.instMax maxOfLe))
any_goals (
split; rename_i h1 n1 heq1; split; rename_i h2 n2 heq2; rw [heq1]; simp
by_cases h1 < h2 <;> rename_i hineqh
· apply Prod.Lex.left; split; assumption; linarith
· by_cases h1 = h2 <;> rename_i heqh <;>
have pos2 : 0 < (star_metric r2).snd := zero_lt_size r2<;>
rw [heq2] at pos2<;> simp at pos2
· rw [heqh]; simp; apply Prod.Lex.right; linarith
· have disj : h1 = h2 ∨ h2 < h1 := Nat.eq_or_lt_of_not_lt hineqh;
have : h1 > h2 := (by
cases disj with
| inl h1eqh2 => contradiction
| inr h2lth1 => linarith);
split; linarith; apply Prod.Lex.right; linarith)
any_goals (
split; rename_i h2 n2 heq2; split; rename_i h1 n1 heq1; rw [heq1]; simp
by_cases h1 < h2 <;> rename_i hineqh
· apply Prod.Lex.left; split; linarith; assumption
· by_cases h1 = h2 <;> rename_i heqh <;>
have pos1 : 0 < (star_metric r1).snd := zero_lt_size r1<;>
rw [heq2] at pos1<;> simp at pos1
· rw [heqh]; simp; apply Prod.Lex.right; linarith
· have disj : h1 = h2 ∨ h2 < h1 := Nat.eq_or_lt_of_not_lt hineqh;
have : h1 > h2 := (by
cases disj with
| inl h1eqh2 => contradiction
| inr h2lth1 => linarith);
split; apply Prod.Lex.right; linarith; linarith)
-- repeat (first | (
-- split; rename_i h1 n1 heq1; split; rename_i h2 n2 heq2; rw [heq1]; simp
-- by_cases h1 < h2 <;> rename_i hineqh
-- · apply Prod.Lex.left; split; assumption; linarith
-- · by_cases h1 = h2 <;> rename_i heqh <;>
-- have pos2 : 0 < (star_metric r2).snd := zero_lt_size r2<;>
-- rw [heq2] at pos2<;> simp at pos2
-- · rw [heqh]; simp; apply Prod.Lex.right; linarith
-- · have disj : h1 = h2 ∨ h2 < h1 := Nat.eq_or_lt_of_not_lt hineqh;
-- have : h1 > h2 := (by
-- cases disj with
-- | inl h1eqh2 => contradiction
-- | inr h2lth1 => linarith);
-- split; linarith; apply Prod.Lex.right; linarith)
-- )
end Regex