using System.Text.RegularExpressions;

namespace SiteMapLibrary;

/// <summary>
/// Crawls pages starting from a seed URL, harvesting every string captured by a
/// caller-supplied regular expression, and hands the visited URLs to an
/// <see cref="XmlManager"/> once the work queue is empty.
/// </summary>
/// <remarks>
/// URLs are processed in FIFO order. Harvested values are passed to
/// <see cref="HttpClient.GetAsync(string)"/> as-is, so the pattern is expected to
/// yield absolute URLs (no <c>BaseAddress</c> is configured on the client).
/// </remarks>
public class SiteMap : IDisposable
{
    private readonly HttpClient _client;

    // Every distinct value the pattern has ever produced (used to de-duplicate).
    private readonly HashSet<string> _foundUrls;

    // Every URL a request has been attempted for, including failed ones.
    private readonly HashSet<string> _visitedUrls;

    // URLs discovered but not yet requested.
    private readonly Queue<string> _visitQueue;

    private bool _disposed = false;

    private XmlManager XmlManager { get; set; }

    /// <summary>
    /// The URL currently being processed; <c>null</c> once the crawl has run out of work.
    /// </summary>
    public string? Url { get; private set; }

    /// <summary>
    /// Pattern applied to each downloaded page body. Every successful, non-empty group of
    /// every match (including group 0, the whole match) is treated as a URL.
    /// </summary>
    public Regex Regexp { get; set; }

    /// <summary>
    /// Creates a crawler that builds its own <see cref="XmlManager"/> from <paramref name="savepath"/>.
    /// </summary>
    /// <param name="url">Seed URL the crawl starts from.</param>
    /// <param name="savepath">Value forwarded to the <see cref="XmlManager"/> constructor.</param>
    /// <param name="pattern">Pattern used to extract URLs from page bodies.</param>
    public SiteMap(string url, string savepath, Regex pattern)
        : this(url, new XmlManager(savepath), pattern)
    {
    }

    /// <summary>
    /// Creates a crawler that reports through an existing <see cref="XmlManager"/>.
    /// </summary>
    /// <param name="url">Seed URL the crawl starts from.</param>
    /// <param name="mgr">Manager that receives the visited URLs and is asked to save them.</param>
    /// <param name="pattern">Pattern used to extract URLs from page bodies.</param>
    public SiteMap(string url, XmlManager mgr, Regex pattern)
    {
        _client = new HttpClient();
        _foundUrls = new HashSet<string>();
        _visitedUrls = new HashSet<string>();
        _visitQueue = new Queue<string>();
        Regexp = pattern;
        Url = url;
        XmlManager = mgr;
    }

    /// <summary>
    /// Requests <see cref="Url"/> and every URL discovered from it until the queue is empty,
    /// then writes all visited URLs through the <see cref="XmlManager"/>.
    /// </summary>
    /// <remarks>
    /// A non-success status code or an <see cref="HttpRequestException"/> is logged to the
    /// console and the crawl moves on to the next queued URL. Other exceptions propagate.
    /// </remarks>
    public async Task Crawl()
    {
        while (Url is { } current)
        {
            _visitedUrls.Add(current);

            HttpResponseMessage response;
            try
            {
                response = await _client.GetAsync(current);
            }
            catch (HttpRequestException ex)
            {
                // One unreachable page must not discard everything crawled so far;
                // treat it like a bad status code and keep going.
                Console.WriteLine($"{ex.Message} on url: {current}");
                NextUrl();
                continue;
            }

            // The using block is the single owner of the response; no manual Dispose needed.
            using (response)
            {
                if (response.IsSuccessStatusCode)
                {
                    string body = await response.Content.ReadAsStringAsync();
                    CollectUrls(body);
                }
                else
                {
                    Console.WriteLine($"{response.StatusCode} on url: {current}");
                }
            }

            NextUrl();
        }

        WriteXml();
    }

    // Applies Regexp to a page body and enqueues every newly seen captured value.
    private void CollectUrls(string body)
    {
        foreach (Match match in Regexp.Matches(body))
        {
            foreach (Group group in match.Groups)
            {
                // Optional groups that did not participate (and empty matches) have an
                // empty Value; requesting "" would throw, so skip them.
                if (!group.Success || group.Value.Length == 0)
                {
                    continue;
                }

                // Add returns false for anything seen before, which also guarantees the
                // value is not already sitting in the queue.
                if (!_foundUrls.Add(group.Value))
                {
                    continue;
                }

                Console.WriteLine(group.Value);

                // The seed URL is visited without ever entering _foundUrls, hence this check.
                if (!_visitedUrls.Contains(group.Value))
                {
                    _visitQueue.Enqueue(group.Value);
                }
            }
        }
    }

    // Passes the visited URLs to the XmlManager, shortest first, then saves.
    private void WriteXml()
    {
        foreach (string url in _visitedUrls.OrderBy(u => u.Length))
        {
            XmlManager.AddUrl(url);
        }

        XmlManager.Save();
    }

    // Advances Url to the next queued entry, or to null when the queue is empty.
    private void NextUrl()
    {
        Url = _visitQueue.Count == 0 ? null : _visitQueue.Dequeue();
    }

    /// <summary>
    /// Releases the underlying <see cref="HttpClient"/>.
    /// </summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Disposes owned resources once; subsequent calls are no-ops.
    /// </summary>
    /// <param name="disposing"><c>true</c> to dispose the managed <see cref="HttpClient"/>.</param>
    public void Dispose(bool disposing)
    {
        if (_disposed)
        {
            return;
        }

        if (disposing)
        {
            _client.Dispose();
        }

        _disposed = true;
    }
}