ALib C++ Framework
by
Library Version: 2605 R0
Documentation generated by doxygen
Loading...
Searching...
No Matches
fscanner.hpp
Go to the documentation of this file.
1//==================================================================================================
2/// \file
3/// This header-file is part of module \alib_filetree of the \aliblong.
4///
5/// Copyright 2013-2026 A-Worx GmbH, Germany.
6/// Published under #"mainpage_license".
7//==================================================================================================
8ALIB_EXPORT namespace alib { namespace filetree {
9
10#if ALIB_DEBUG
11/// The format string used with verbose logging to domain <c>/ALIB/FILETREE/SCAN</c> by the namespace
12/// function #"ScanFiles(FTree&)".<br>
13/// Defaults to <c>" {:ta h{2,r} on{10,r} gn{10,r} s(IEC){10,r} dm qqq nf l}"</c>
15#endif
16
17
18/// Input parameters to function #"ScanFiles(FTree&)".
20 /// Options for processing symbolic links during a scan.
21 enum class SymbolicLinks {
22 DONT_RESOLVE = 0, ///< Demands \b not to resolve symbolic links in any way.
23 RESOLVE_BUT_DONT_FOLLOW = 1, ///< Demands to read symbolic links, but not to follow linked directories.
24 ///< FTValue dates, sizes, and access rights are set according to
25 ///< the link target.
26 RECURSIVE = 2, ///< Demands to read symbolic links and, in case they target a
27 ///< directory, to recurse into it, provided this directory meets the
28 ///< other constraints associated with the current scan.
29 };
30
31 /// Denotes 'infinite' recursion if set to field #"MaxDepth".
32 static constexpr unsigned InfiniteRecursion = (std::numeric_limits<unsigned>::max)();
33
34 /// The path to be scanned.
36
37 /// Denotes how symbolic links are treated.
39
40 /// The maximum recursion depth. Defaults to #"InfiniteRecursion".
42
43 /// If \c true, the default, scanning does not stop recursion on directories which represent
44 /// a mounted filesystem. If \c false, the search is restricted to the device that #"StartPath"
45 /// resides in.
46 bool CrossFileSystems = true;
47
48 /// If \c false (the default), scanning aborts if \e 'artificial' filesystems are found.
49 /// Artificial filesystems under GNU/Linux, are for example:
50 /// <c>/proc</c>, <c>/dev</c>, <c>/run</c>, <c>/sys</c>, and <c>/temp</c>.
51 bool IncludeArtificialFS = false;
52
53 /// If \c false, empty directories remain in the result tree. Otherwise, they are deleted
54 /// and do not appear in the tree.
56
57 /// If set (not containing \c nullptr), files are passed to this filter and removed if \c false
58 /// is returned.<br>
59 /// The term "files" here means all sorts of files except Directories.
60 /// Directories are either real directories, or in case the field #"LinkTreatment" is set to
61 /// #"SymbolicLinks::RECURSIVE", symbolic links that
62 /// target a directory.
63 ///
64 /// \see Optional filters #"DirectoryFilterPreRecursion" and #"DirectoryFilterPostRecursion".
66
67 /// If set (not containing \c nullptr), this filter is invoked \b after a recursive scan of
68 /// a directory. If \c false is returned, the scanned subtree is discarded, but the (then empty)
69 /// directory remains in the result list, if field #"RemoveEmptyDirectories" evaluates to
70 /// \c false.<br>
71 /// Note that in case field #"LinkTreatment" is set to
72 /// #"SymbolicLinks::RECURSIVE", this filter
73 /// is also applied to symbolic links, which are readable, not broken, and target a directory.
74 ///
75 /// \note
76 /// Directories (and symbolic links to directories) are first recursively scanned before this
77 /// filter is applied. On deletion, of course the whole scanned subtree is deleted.
78 /// This allows filtering directories, depending on information available only after
79 /// scanning, hence by the numbers retrieved with #"FTValue::Sums;*".
80 /// To increase performance and filter directories \e before their recursive scan,
81 /// alternative field #"DirectoryFilterPreRecursion" is to be used.
82 ///
83 /// \see Optional filters #"DirectoryFilterPreRecursion" and #"FileFilter".
84 ///
86
87 /// Same as #".DirectoryFilterPostRecursion" but is used \b before a recursive scan of
88 /// a directory. Consequently, this filter leads to much higher scan performance than the
89 /// alternative version, because huge branches of the file system might be omitted during scan.
90 /// However, the numbers retrieved with #"FTValue::Sums;*" will all indicate
91 /// \c 0, because no information is retrieved.<br>
92 /// If a directory is "pruned" due to this filter, the entry still occurs in the #"%FTree",
93 /// unless field #".RemoveEmptyDirectories" evaluates to \c true.<br>
94 ///
95 /// \see Optional filters #".DirectoryFilterPostRecursion" and #".FileFilter".
96 ///
98
99 /// Constructor accepting all features.
100 /// @param startPath Stored in field #".StartPath".
101 /// @param linkTreatment Stored in field #".LinkTreatment". Defaults to #"%SymbolicLinks::RECURSIVE".
102 /// @param maxDepth Stored in field #".MaxDepth". Defaults to #".InfiniteRecursion".
103 /// @param crossFileSystems Stored in field #".CrossFileSystems". Defaults to \c true.
104 /// @param includeArtificialFS Stored in field #".IncludeArtificialFS". Defaults to \c false.
105 ScanParameters( const PathString& startPath,
107 unsigned maxDepth = InfiniteRecursion,
108 bool crossFileSystems = true,
109 bool includeArtificialFS= false )
110 : StartPath (startPath )
111 , LinkTreatment (linkTreatment )
112 , MaxDepth (maxDepth )
113 , CrossFileSystems (crossFileSystems )
114 , IncludeArtificialFS(includeArtificialFS) {}
115
116}; // struct ScanParameters
117
118/// A simple vector containing nodes of an #"FTree". Such nodes are collected during calls
119/// of the function #"ScanFiles". One call (i.e., the recursive scan of one path) can result
120/// in more than one entry in this list, because with resolving symbolic links new isolated
121/// siblings can occur.<br>
122/// The single new method of this type is #".Add", which checks if the given new start-path
123/// supersedes others or is itself superseded by an existing path. In that case the
124/// superseded path is deleted.
125///
126/// Despite the little effort that \alib takes with the provision of these mechanics, often the
127/// analysis of, or a loop through, this path list is not necessary. This is because most
128/// using code would just scan one or more paths and then #"StringTreeIterator;loop through" just
129/// all resulting directory and file nodes that have been inserted into the tree.
130/// Consequently, the function #"ScanFiles" accepts an instance of this class only optionally.
131struct CanonicalPathList : std::vector<FTFile> {
132 /// Adds the given node to the list, unless it is superseded by an already
133 /// collected node. Conversely, existing nodes that are superseded by the given one are removed.
134 /// @param node The node to add.
136 void Add(FTFile node);
137};
138
139
140/// ### General Information ###
141/// Scans the filesystem according to the given #"%ScanParameters" and adds #"FTValue"
142/// entries to the given #"FTree".
143///
144/// ### ALib FTree Data Contract ###
145/// This function has a contract with the class #"FTree" that is used to store the scan results.
146/// This contract states that any file or directory found during a scan is always stored using
147/// the <em>"Real Path"</em> of the entry. This means that any symbolic link is resolved.
148/// The consequences are:
149/// - Files and directories which represent a symbolic link are always "leaf nodes".
150 /// (They never contain child nodes.) However, their symlink target path is attached twice
151/// to the entry:
152/// 1. The original link information given, which often uses relative path addressing.
153/// 2. The absolute, <em>"Real Path"</em> of the target, which has a corresponding result entry
154/// in the given #"%FTree".
155/// - If a using software wants to use symbolic paths, for example, to present them to the end
156/// user, such paths have to be assembled by the user's code in own responsibility.
157/// All information for doing this is provided in the resulting tree object
158/// - If symbolic path reconstruction is needed, nodes that participate in symbolic backlinks
159/// must stay alive. Therefore, deleting nodes after scanning (or enabling scan options that
160/// delete nodes during scanning) is not compatible with this feature.
161/// - Doubly linked target files and directories are never a problem for this scanner. Each
162/// file is scanned only once. This especially prevents all sorts of problems that would otherwise
163/// occur with cyclic symbolic links.
164/// - Due to this, even the given start path of a search might not be found as a result
165/// in the given #"%FTree", because also start paths are converted to a <em>Real Path</em>.
166/// - The scan result may contain more than one resulting path. This happens if a symbolic link
167/// targets a file or directory not recursively included in the start path.
168/// The resulting <em>"Real Path"</em> of the given start path is, however, always the first
169/// result added.
170///
171/// The latter is reflected with (optional) parameter \p{resultPaths} of this function, which is
172/// of type #"CanonicalPathList".
173///
174/// \note
175/// Because the class #"FTree" is based on class #"StringTree", using code
176/// is enabled to break this contract by adding entries below symbolic links.
177/// Other entities of this \alibmod_nl will not break this contract.
178///
179/// ### Rescanning of Entries ###
180/// Existing entries in the given \p{tree} are not overwritten. They might be scanned with "higher"
181/// #"FileStatus::ScanStates;*" values, depending on given \p{parameters} and how they had been
182/// scanned before. If the same "level" of scanning is provided, existing entries will not be
183/// scanned again. If a rescan of a certain path is wanted, then the target entry of that path has
184/// to be deleted before invoking this function. Due to the implementation of class FTree, repeated
185/// delete and scan operations will not cause any heap-memory allocations or deallocations.
186///
187/// ### Platform-Dependent Code Selection ###
188/// File scanning is a platform-dependent task and hence \alib uses one of two different
189/// implementations:
190/// 1. A POSIX version for compatible OSes,
191/// 2. A version that relies on <c>C++ std::filesystem</c>.
192///
193/// The fallback version using <c>std::filesystem</c> has the following restrictions:
194/// - The only time attribute available is the #"FileStatus::MDate;modification time" of
195/// an entry. The fields #"FileStatus::BDate", #"FileStatus::CDate", and #"FileStatus::ADate" are always set
196/// to the same as the modification time, even on filesystems that support the other values.
197/// - The file time of symbolic links is \b always that of the target file. The C++ standard has
198/// no possibility to access the link's time itself.
199/// - The file time of broken symbolic links is set to the current time (time of scanning).
200/// - The size that directories occupy on a disk cannot be determined.
201/// Directory entries always report size <c>0</c>.
202/// - The target of a symbolic link which points to a non-accessible directory, cannot be resolved
203/// to a "real" (aka canonical) path, even if all other path components before were accessible.
204/// (This is true for the implementation of the standard library under GNU/Linux and Clang
205/// compiler at the time of writing this, 2024/02.)
206/// - The flag #"ScanParameters::CrossFileSystems;*" is ignored. Crossing Filesystems cannot
207/// be detected using purely the standard library.<br>
208/// - A file's owner and owning group are not determined. Instead, #"FileStatus::UnknownID;*"
209/// is set for both.
210/// - The scanning process is half as fast as in the POSIX version. The reason for this is probably
211/// the internal allocation and deallocation of many quite volatile string objects in the C++
212/// standard library.
213/// Well, but it is still fast though!
214///
215/// \note As of today, using this module under Windows OS will fall back to the
216/// <em>C++ std::filesystem</em> version. It may be that a future version will provide a
217/// native implementation for this target system. Volunteers from the community are welcome to
218/// contribute.
219///
220/// @param tree The tree to fill.
221/// @param parameters The input parameters to determine the scan process.
222/// @param[out] resultPaths An optional container to store the result paths of a scan.
223/// If \c nullptr is given, the result paths are not collected. See the
224/// #"CanonicalPathList;types documentation" for further information.
225/// @param[out] remainingStart An optional path string. If given, on failure, it will receive the
226/// remainder of the path given with #"ScanParameters::StartPath;2"
227/// starting with the first directory or file that could not be resolved
228/// or accessed.
229///
230/// @return The scan state code of the tree node of the first resulting path, hence of the node
231/// referred to by the given #"ScanParameters::StartPath;2".<br>
232/// On error, i.e. if the start path was invalid, not accessible, a broken link, a circular
233/// link, or other failures, #"ScanStates::NOT_EXISTENT" is returned.
236 ScanParameters& parameters,
237 CanonicalPathList* resultPaths = nullptr,
238 Path* remainingStart = nullptr );
239
240/// Classification of path root formats that cannot be directly scanned.
241enum class PathRootKind {
242 Errorneous, ///< Erroneous path format.
243 Relative, ///< Relative path (no special root).
244 AbsoluteRoot, ///< Unix-style absolute path starting with /.
245 DriveLetter, ///< Windows drive letter (C:, D:, etc.).
246 UNC, ///< Universal Naming Convention (\\\\server\\share).
247 URL, ///< URL scheme (http://, ftp://, file://, etc.).
248 Device, ///< Windows device path (\\\\.\\...).
249};
250
251/// Result information from MakeCanonical
253 PathRootKind RootKind; ///< What kind of path root was detected
254 FileStatus::ScanStates ScanState; ///< Result of canonicalization/scanning
255};
256
257/// Analyses the given \p{sourcePath} and converts it to its canonical version.
258/// This is similar to what the POSIX function <c>realpath()</c> and C++
259/// <c>std::filesystem::canonical</c> do.<br>
260/// This version, in addition, creates corresponding nodes in the #"FTree" (passed indirectly with
261/// the parameter \p{node}). Besides removing <c>"."</c> and <c>".."</c> entries, symbolic links
262/// are not only resolved, but the nodes they are targeting receive information about the link
263/// that targeted them. This information is set with the method #"FTFile::SetSymbolicParent(FTFile)".
264/// With that, the path of directories or files that are children of such a targeted node, can
265/// re-establish the file-path as originally specified. This is done with the method
266/// #"FTFile::AssembleSymbolicPath".
267///
268/// ## Path Root Detection and Tree Representation
269/// The method detects and handles various path root formats, creating appropriate tree nodes:
270///
271/// - <b>Relative paths</b> (#".PathRootKind::Relative"):<br>
272/// No special root. The path is resolved relative to the provided \p{node}.<br>
273/// Example: <c>"foo/bar"</c>
274///
275/// - <b>Absolute paths</b> (#".PathRootKind::AbsoluteRoot"):<br>
276/// Unix-style absolute paths starting with <c>'/'</c>.<br>
277/// Tree node: positioned at tree root with path <c>'/'</c>.<br>
278/// Example: <c>"/usr/local/bin"</c> → root node
279///
280/// - <b>URL schemes</b> (#".PathRootKind::URL"):<br>
281/// Format: <c>scheme://</c> where scheme is alphanumeric with <c>+</c>, <c>-</c>, or <c>.</c>.<br>
282/// Tree node: child of root named with the scheme (e.g., <c>http</c>, <c>ftp</c>).<br>
283/// The node is marked as type #"Types::SOCKET" with state
284/// #"ScanStates::NOT_EXISTENT" (unscannable virtual node).<br>
285/// Examples: <c>http://example.com</c> → node <c>http</c>,
286/// <c>file://path</c> → node <c>file</c>
287///
288/// - <b>Windows drive letters</b> (#".PathRootKind::DriveLetter"):<br>
289/// Format: <c>C:</c>, <c>C:\\</c>, or <c>C:/</c>.<br>
290/// Tree node: child of root named with the drive letter (e.g., <c>C:</c>).<br>
291/// Examples: <c>C:\\Windows</c> → node <c>"C:"</c>, <c>"D:/data"</c> → node <c>"D:"</c><br>
292/// (Windows only.)
293///
294/// - <b>UNC paths</b> (#".PathRootKind::UNC"):<br>
295/// Universal Naming Convention for network shares: <c>\\\\</c> or <c>//</c>.<br>
296/// Tree node: positioned at the root with path <c>//</c>.<br>
297/// Examples: <c>\\\\server\\share</c> or <c>//server/share</c> → root node with <c>//</c>.<br>
298/// (Windows only.)
299///
300/// - <b>Windows device paths</b> (#".PathRootKind::Device"):<br>
301/// Format: <c>\\\\.\\...</c> or <c>//./...</c>.<br>
302/// Tree node: child of root with synthesized name starting with <c>DEV</c>.<br>
303/// The remaining path is encoded with colons replacing separators.<br>
304/// The node is marked as type #"Types::SOCKET" with state
305/// #"ScanStates::NOT_EXISTENT" (unscannable virtual node).<br>
306/// Examples: <c>\\\\.\\C:</c> → node <c>DEV:C:</c>,
307/// <c>\\\\.\\UNC\\server</c> → treated as UNC after normalization.<br>
308/// (Windows only.)
309///
310/// \note
311/// The detected #".PathRootKind" is returned in the result and should be preserved by callers
312/// if they need to distinguish between root types later, as the node name alone is not always
313/// sufficient to unambiguously identify the root kind (e.g., a drive letter <c>C:</c> vs.
314/// a scheme <c>c:</c> from URL <c>c://...</c>).
315///
316/// \note
317/// This function is used by the function #"ScanFiles" to resolve the start path and symbolic
318/// link targets.
319///
320/// @param[in,out] sourcePath The path to scan. This might contain <c>"."</c> and <c>".."</c>
321/// directories, as well as symbolic links.
322/// When the method exits successfully, this path is empty.
323/// Otherwise, this path-string contains the remaining
324/// path, starting with the name of the file or directory that could
325/// not be found, accessed, or otherwise be resolved.
326/// @param[in,out] node The starting node. In case the parameter \p{sourcePath} contains
327/// a root specification (absolute path, URL, etc.), this node is moved
328/// to the appropriate position in the tree (often the root).<br>
329/// When the method exits successfully, this cursor targets the
330/// file that the source path resolved to.<br>
331/// In case of failure, this cursor becomes
332/// #"TCursor::IsInvalid;invalid".
333/// @param[in,out] pathToNode This path has to point to the given \p{node} when the method is
334/// called. When the method returns, it contains the canonical path
335/// to the then repositioned \p{node}.
336/// @param[in,out] resultPaths Optional pointer to a #".CanonicalPathList" that receives nodes
337/// for newly created paths during resolution.
338/// @return A #"CanonicalResult" containing:
339/// - #"CanonicalResult::RootKind": The detected path root format
340/// - #"CanonicalResult::ScanState": The scan state indicating success or failure:<br>
341/// #"ScanStates::NONE" (new node created),
342/// #"ScanStates::DUPLICATE" (already existed),
343/// #"ScanStates::STATS" or #"ScanStates::RESOLVED" (successfully scanned),
344/// #"ScanStates::NOT_EXISTENT" (path not found), or
345/// other error states.
347 FTree::Cursor& node,
348 Path& pathToNode,
349 CanonicalPathList* resultPaths= nullptr );
350
351} // namespace alib[::filetree]
352
353
354/// Type alias in namespace #"%alib".
356
357/// Type alias in namespace #"%alib".
359
360} // namespace [alib]
#define ALIB_DLL
#define ALIB_EXPORT
ScanStates
Per-entry information about how a node was scanned.
FTValue::ScanStates ScanFiles(FTree &tree, ScanParameters &parameters, CanonicalPathList *resultPaths=nullptr, Path *remainingStart=nullptr)
PathRootKind
Classification of path root formats that cannot be directly scanned.
Definition fscanner.hpp:241
@ DriveLetter
Windows drive letter (C:, D:, etc).
Definition fscanner.hpp:245
@ Errorneous
Errorneous path format.
Definition fscanner.hpp:242
@ AbsoluteRoot
Unix-style absolute path starting with /.
Definition fscanner.hpp:244
@ Relative
Relative path (no special root).
Definition fscanner.hpp:243
@ Device
Windows device path (\\.\...).
Definition fscanner.hpp:248
@ URL
URL scheme (http://, ftp://, file://, etc).
Definition fscanner.hpp:247
@ UNC
Universal Naming Convention (\\server\share).
Definition fscanner.hpp:246
std::shared_ptr< FFilter > SPFileFilter
A shared pointer to a filter.
Definition ffilter.hpp:43
String DBG_FILETREE_SCAN_VERBOSE_LOG_FORMAT
CanonicalResult MakeCanonical(Path &sourcePath, FTree::Cursor &node, Path &pathToNode, CanonicalPathList *resultPaths=nullptr)
Definition alox.cpp:14
strings::TString< character > String
Type alias in namespace #"%alib".
Definition string.hpp:2165
system::Path Path
Type alias in namespace #"%alib".
Definition path.hpp:417
strings::TString< PathCharType > PathString
The string-type used with this ALib Module.
Definition path.hpp:34
filetree::ScanParameters ScanParameters
Type alias in namespace #"%alib".
Definition fscanner.hpp:355
filetree::CanonicalPathList CanonicalPathList
Type alias in namespace #"%alib".
Definition fscanner.hpp:358
Result information from MakeCanonical.
Definition fscanner.hpp:252
PathRootKind RootKind
What kind of path root was detected.
Definition fscanner.hpp:253
FileStatus::ScanStates ScanState
Result of canonicalization/scanning.
Definition fscanner.hpp:254
Input parameters to function #"ScanFiles(FTree&)".
Definition fscanner.hpp:19
ScanParameters(const PathString &startPath, SymbolicLinks linkTreatment=SymbolicLinks::RECURSIVE, unsigned maxDepth=InfiniteRecursion, bool crossFileSystems=true, bool includeArtificialFS=false)
Definition fscanner.hpp:105
SymbolicLinks
Options for processing symbolic links.
Definition fscanner.hpp:21
@ DONT_RESOLVE
Demands not to resolve symbolic links in any way.
Definition fscanner.hpp:22
SPFileFilter DirectoryFilterPostRecursion
Definition fscanner.hpp:85
unsigned MaxDepth
The maximum recursion depth. Defaults to #"InfiniteRecursion".
Definition fscanner.hpp:41
static constexpr unsigned InfiniteRecursion
Denotes 'infinite' recursion if set to field #"MaxDepth".
Definition fscanner.hpp:32
SymbolicLinks LinkTreatment
Denotes how symbolic links are treated.
Definition fscanner.hpp:38
Path StartPath
The path to be scanned.
Definition fscanner.hpp:35
SPFileFilter DirectoryFilterPreRecursion
Definition fscanner.hpp:97