Auto merge of #87580 - ChrisDenton:win-arg-parse-2008, r=m-ou-se

Update Windows Argument Parsing

Fixes #44650

The Windows command line is passed to applications [as a single string](https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string), which the application then parses to get a list of arguments. The standard rules (as used by C/C++) for parsing the command line have changed slightly over the years, most recently in 2008, when new escaping rules were added.

This PR implements the new rules as [described on MSDN](https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments) and [further detailed here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN). It has been tested against the behaviour of C++ by calling a C++ program that outputs its raw command line and the contents of `argv`. See [my repo](https://github.com/ChrisDenton/winarg/tree/std) if anyone wants to reproduce my work.

For an overview of how this PR changes argument parsing behavior and why we feel it is warranted see https://github.com/rust-lang/rust/pull/87580#issuecomment-893833893.

For some examples see: https://github.com/rust-lang/rust/pull/87580#issuecomment-894299249
This commit is contained in:
bors 2021-09-02 16:16:13 +00:00
commit 1cf8fdd4f0
4 changed files with 208 additions and 130 deletions

View File

@ -253,6 +253,7 @@
#![feature(const_ip)] #![feature(const_ip)]
#![feature(const_ipv4)] #![feature(const_ipv4)]
#![feature(const_ipv6)] #![feature(const_ipv6)]
#![feature(const_option)]
#![feature(const_raw_ptr_deref)] #![feature(const_raw_ptr_deref)]
#![feature(const_socketaddr)] #![feature(const_socketaddr)]
#![feature(const_trait_impl)] #![feature(const_trait_impl)]

View File

@ -1,13 +1,18 @@
#![allow(dead_code)] // runtime init functions not used during testing //! The Windows command line is just a string
//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
//!
//! This module implements the parsing necessary to turn that string into a list of arguments.
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
use crate::ffi::OsString; use crate::ffi::OsString;
use crate::fmt; use crate::fmt;
use crate::marker::PhantomData;
use crate::num::NonZeroU16;
use crate::os::windows::prelude::*; use crate::os::windows::prelude::*;
use crate::path::PathBuf; use crate::path::PathBuf;
use crate::slice; use crate::ptr::NonNull;
use crate::sys::c; use crate::sys::c;
use crate::sys::windows::os::current_exe; use crate::sys::windows::os::current_exe;
use crate::vec; use crate::vec;
@ -15,9 +20,11 @@ use crate::vec;
use core::iter; use core::iter;
pub fn args() -> Args { pub fn args() -> Args {
// SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
// string so it's safe for `WStrUnits` to use.
unsafe { unsafe {
let lp_cmd_line = c::GetCommandLineW(); let lp_cmd_line = c::GetCommandLineW();
let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || { let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new()) current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
}); });
@ -28,129 +35,120 @@ pub fn args() -> Args {
/// Implements the Windows command-line argument parsing algorithm. /// Implements the Windows command-line argument parsing algorithm.
/// ///
/// Microsoft's documentation for the Windows CLI argument format can be found at /// Microsoft's documentation for the Windows CLI argument format can be found at
/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>. /// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
/// ///
/// Windows includes a function to do this in shell32.dll, /// A more in-depth explanation is here:
/// but linking with that DLL causes the process to be registered as a GUI application. /// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
///
/// Windows includes a function to do command line parsing in shell32.dll.
/// However, this is not used for two reasons:
///
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
/// GUI applications add a bunch of overhead, even if no windows are drawn. See /// GUI applications add a bunch of overhead, even if no windows are drawn. See
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>. /// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
/// ///
/// This function was tested for equivalence to the shell32.dll implementation in /// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
/// Windows 10 Pro v1803, using an exhaustive test suite available at ///
/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or /// This function was tested for equivalence to the C/C++ parsing rules using an
/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>. /// extensive test suite available at
unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>( /// <https://github.com/ChrisDenton/winarg/tree/std>.
lp_cmd_line: *const u16, fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
lp_cmd_line: Option<WStrUnits<'a>>,
exe_name: F, exe_name: F,
) -> Vec<OsString> { ) -> Vec<OsString> {
const BACKSLASH: u16 = '\\' as u16; const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap();
const QUOTE: u16 = '"' as u16; const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap();
const TAB: u16 = '\t' as u16; const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap();
const SPACE: u16 = ' ' as u16; const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap();
let mut ret_val = Vec::new(); let mut ret_val = Vec::new();
if lp_cmd_line.is_null() || *lp_cmd_line == 0 { // If the cmd line pointer is null or it points to an empty string then
// return the name of the executable as argv[0].
if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
ret_val.push(exe_name()); ret_val.push(exe_name());
return ret_val; return ret_val;
} }
let mut cmd_line = { let mut code_units = lp_cmd_line.unwrap();
let mut end = 0;
while *lp_cmd_line.offset(end) != 0 {
end += 1;
}
slice::from_raw_parts(lp_cmd_line, end as usize)
};
// The executable name at the beginning is special. // The executable name at the beginning is special.
cmd_line = match cmd_line[0] {
// The executable name ends at the next quote mark,
// no matter what.
QUOTE => {
let args = {
let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE);
if let Some(exe) = cut.next() {
ret_val.push(OsString::from_wide(exe));
}
cut.next()
};
if let Some(args) = args {
args
} else {
return ret_val;
}
}
// Implement quirk: when they say whitespace here,
// they include the entire ASCII control plane:
// "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
// will consider the first argument to be an empty string. Excess whitespace at the
// end of lpCmdLine is ignored."
0..=SPACE => {
ret_val.push(OsString::new());
&cmd_line[1..]
}
// The executable name ends at the next whitespace,
// no matter what.
_ => {
let args = {
let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE);
if let Some(exe) = cut.next() {
ret_val.push(OsString::from_wide(exe));
}
cut.next()
};
if let Some(args) = args {
args
} else {
return ret_val;
}
}
};
let mut cur = Vec::new();
let mut in_quotes = false; let mut in_quotes = false;
let mut was_in_quotes = false; let mut cur = Vec::new();
let mut backslash_count: usize = 0; for w in &mut code_units {
for &c in cmd_line { match w {
match c { // A quote mark always toggles `in_quotes` no matter what because
// backslash // there are no escape characters when parsing the executable name.
BACKSLASH => { QUOTE => in_quotes = !in_quotes,
backslash_count += 1; // If not `in_quotes` then whitespace ends argv[0].
was_in_quotes = false; SPACE | TAB if !in_quotes => break,
} // In all other cases the code unit is taken literally.
QUOTE if backslash_count % 2 == 0 => { _ => cur.push(w.get()),
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
backslash_count = 0;
if was_in_quotes {
cur.push('"' as u16);
was_in_quotes = false;
} else {
was_in_quotes = in_quotes;
in_quotes = !in_quotes;
}
}
QUOTE if backslash_count % 2 != 0 => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
backslash_count = 0;
was_in_quotes = false;
cur.push(b'"' as u16);
}
SPACE | TAB if !in_quotes => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
if !cur.is_empty() || was_in_quotes {
ret_val.push(OsString::from_wide(&cur[..]));
cur.truncate(0);
}
backslash_count = 0;
was_in_quotes = false;
}
_ => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
backslash_count = 0;
was_in_quotes = false;
cur.push(c);
}
} }
} }
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); // Skip whitespace.
// include empty quoted strings at the end of the arguments list code_units.advance_while(|w| w == SPACE || w == TAB);
if !cur.is_empty() || was_in_quotes || in_quotes { ret_val.push(OsString::from_wide(&cur));
// Parse the arguments according to these rules:
// * All code units are taken literally except space, tab, quote and backslash.
// * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
// treated as a single separator.
// * A space or tab `in_quotes` is taken literally.
// * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
// * A quote can be escaped if preceded by an odd number of backslashes.
// * If any number of backslashes is immediately followed by a quote then the number of
// backslashes is halved (rounding down).
// * Backslashes not followed by a quote are all taken literally.
// * If `in_quotes` then a quote can also be escaped using another quote
// (i.e. two consecutive quotes become one literal quote).
let mut cur = Vec::new();
let mut in_quotes = false;
while let Some(w) = code_units.next() {
match w {
// If not `in_quotes`, a space or tab ends the argument.
SPACE | TAB if !in_quotes => {
ret_val.push(OsString::from_wide(&cur[..]));
cur.truncate(0);
// Skip whitespace.
code_units.advance_while(|w| w == SPACE || w == TAB);
}
// Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
BACKSLASH => {
let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
if code_units.peek() == Some(QUOTE) {
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
// The quote is escaped if there are an odd number of backslashes.
if backslash_count % 2 == 1 {
code_units.next();
cur.push(QUOTE.get());
}
} else {
// If there is no quote on the end then there is no escaping.
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
}
}
// If `in_quotes` and not backslash escaped (see above) then a quote either
// unsets `in_quotes` or is escaped by another quote.
QUOTE if in_quotes => match code_units.peek() {
// Two consecutive quotes when `in_quotes` produces one literal quote.
Some(QUOTE) => {
cur.push(QUOTE.get());
code_units.next();
}
// Otherwise unset `in_quotes`.
Some(_) => in_quotes = false,
// The end of the command line.
// Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
None => break,
},
// If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quotes`.
QUOTE => in_quotes = true,
// Everything else is always taken literally.
_ => cur.push(w.get()),
}
}
// Push the final argument, if any.
if !cur.is_empty() || in_quotes {
ret_val.push(OsString::from_wide(&cur[..])); ret_val.push(OsString::from_wide(&cur[..]));
} }
ret_val ret_val
@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
self.parsed_args_list.len() self.parsed_args_list.len()
} }
} }
/// Iterates safely over the UTF-16 code units of an `LPWSTR`,
/// i.e. a pointer to a run of `u16` values terminated by a NUL (zero) unit.
struct WStrUnits<'a> {
    // Invariant: never null. The cursor only ever moves forward one unit at a
    // time and stops at the terminating NUL.
    lpwstr: NonNull<u16>,
    // Ties the pointed-to memory to `'a` without holding an actual reference.
    lifetime: PhantomData<&'a [u16]>,
}

impl WStrUnits<'_> {
    /// Wraps `lpwstr` in an iterator, or returns `None` when the pointer is null.
    ///
    /// # Safety
    ///
    /// `lpwstr` must point to a null-terminated wide string that remains valid
    /// for at least as long as the returned value is alive.
    unsafe fn new(lpwstr: *const u16) -> Option<Self> {
        let lpwstr = NonNull::new(lpwstr as *mut u16)?;
        Some(Self { lpwstr, lifetime: PhantomData })
    }

    /// Returns the code unit under the cursor without advancing, or `None`
    /// when the cursor is on the terminating NUL.
    fn peek(&self) -> Option<NonZeroU16> {
        // SAFETY: The cursor never moves past the terminating NUL, so it
        // always points at a readable code unit inside the string.
        let unit = unsafe { self.lpwstr.as_ptr().read() };
        NonZeroU16::new(unit)
    }

    /// Steps forward for as long as `predicate` accepts the current code unit.
    ///
    /// Stops at the first rejected unit (which is left unconsumed) or at the
    /// end of the string, and returns how many units were skipped.
    fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
        let mut skipped = 0;
        loop {
            match self.peek() {
                Some(unit) if predicate(unit) => {
                    skipped += 1;
                    self.next();
                }
                // Either the predicate said no or we hit the terminator.
                _ => return skipped,
            }
        }
    }
}

impl Iterator for WStrUnits<'_> {
    // Zero is the terminator, so it is never yielded as an item.
    type Item = NonZeroU16;

    fn next(&mut self) -> Option<NonZeroU16> {
        // Bail out on the terminator *before* touching the cursor.
        let unit = self.peek()?;
        // SAFETY: `unit` is non-zero, so the terminating NUL lies further
        // along. Stepping one unit forward therefore stays inside the string
        // and cannot produce a null pointer.
        self.lpwstr = unsafe { NonNull::new_unchecked(self.lpwstr.as_ptr().add(1)) };
        Some(unit)
    }
}

View File

@ -5,9 +5,9 @@ fn chk(string: &str, parts: &[&str]) {
let mut wide: Vec<u16> = OsString::from(string).encode_wide().collect(); let mut wide: Vec<u16> = OsString::from(string).encode_wide().collect();
wide.push(0); wide.push(0);
let parsed = let parsed =
unsafe { parse_lp_cmd_line(wide.as_ptr() as *const u16, || OsString::from("TEST.EXE")) }; unsafe { parse_lp_cmd_line(WStrUnits::new(wide.as_ptr()), || OsString::from("TEST.EXE")) };
let expected: Vec<OsString> = parts.iter().map(|k| OsString::from(k)).collect(); let expected: Vec<OsString> = parts.iter().map(|k| OsString::from(k)).collect();
assert_eq!(parsed.as_slice(), expected.as_slice()); assert_eq!(parsed.as_slice(), expected.as_slice(), "{:?}", string);
} }
#[test] #[test]
@ -27,35 +27,65 @@ fn single_words() {
#[test] #[test]
fn official_examples() { fn official_examples() {
chk(r#"EXE "abc" d e"#, &["EXE", "abc", "d", "e"]); chk(r#"EXE "abc" d e"#, &["EXE", "abc", "d", "e"]);
chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r#"a\\\b"#, "de fg", "h"]); chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]);
chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]); chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]);
chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r#"a\\b c"#, "d", "e"]); chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]);
} }
#[test] #[test]
fn whitespace_behavior() { fn whitespace_behavior() {
chk(r#" test"#, &["", "test"]); chk(" test", &["", "test"]);
chk(r#" test"#, &["", "test"]); chk(" test", &["", "test"]);
chk(r#" test test2"#, &["", "test", "test2"]); chk(" test test2", &["", "test", "test2"]);
chk(r#" test test2"#, &["", "test", "test2"]); chk(" test test2", &["", "test", "test2"]);
chk(r#"test test2 "#, &["test", "test2"]); chk("test test2 ", &["test", "test2"]);
chk(r#"test test2 "#, &["test", "test2"]); chk("test test2 ", &["test", "test2"]);
chk(r#"test "#, &["test"]); chk("test ", &["test"]);
} }
#[test] #[test]
fn genius_quotes() { fn genius_quotes() {
chk(r#"EXE "" """#, &["EXE", "", ""]); chk(r#"EXE "" """#, &["EXE", "", ""]);
chk(r#"EXE "" """"#, &["EXE", "", "\""]); chk(r#"EXE "" """"#, &["EXE", "", r#"""#]);
chk( chk(
r#"EXE "this is """all""" in the same argument""#, r#"EXE "this is """all""" in the same argument""#,
&["EXE", "this is \"all\" in the same argument"], &["EXE", r#"this is "all" in the same argument"#],
); );
chk(r#"EXE "a"""#, &["EXE", "a\""]); chk(r#"EXE "a"""#, &["EXE", r#"a""#]);
chk(r#"EXE "a"" a"#, &["EXE", "a\"", "a"]); chk(r#"EXE "a"" a"#, &["EXE", r#"a" a"#]);
// quotes cannot be escaped in command names // quotes cannot be escaped in command names
chk(r#""EXE" check"#, &["EXE", "check"]); chk(r#""EXE" check"#, &["EXE", "check"]);
chk(r#""EXE check""#, &["EXE check"]); chk(r#""EXE check""#, &["EXE check"]);
chk(r#""EXE """for""" check"#, &["EXE ", r#"for""#, "check"]); chk(r#""EXE """for""" check"#, &["EXE for check"]);
chk(r#""EXE \"for\" check"#, &[r#"EXE \"#, r#"for""#, "check"]); chk(r#""EXE \"for\" check"#, &[r"EXE \for\ check"]);
chk(r#""EXE \" for \" check"#, &[r"EXE \", "for", r#"""#, "check"]);
chk(r#"E"X"E test"#, &["EXE", "test"]);
chk(r#"EX""E test"#, &["EXE", "test"]);
}
// Cases taken from https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX
#[test]
fn post_2008() {
    // Each entry pairs a raw command line with the argument list it must parse to.
    // Entries are checked in order, so the first mismatch is the one reported.
    const CASES: &[(&str, &[&str])] = &[
        ("EXE CallMeIshmael", &["EXE", "CallMeIshmael"]),
        (r#"EXE "Call Me Ishmael""#, &["EXE", "Call Me Ishmael"]),
        (r#"EXE Cal"l Me I"shmael"#, &["EXE", "Call Me Ishmael"]),
        (r#"EXE CallMe\"Ishmael"#, &["EXE", r#"CallMe"Ishmael"#]),
        (r#"EXE "CallMe\"Ishmael""#, &["EXE", r#"CallMe"Ishmael"#]),
        (r#"EXE "Call Me Ishmael\\""#, &["EXE", r"Call Me Ishmael\"]),
        (r#"EXE "CallMe\\\"Ishmael""#, &["EXE", r#"CallMe\"Ishmael"#]),
        (r#"EXE a\\\b"#, &["EXE", r"a\\\b"]),
        (r#"EXE "a\\\b""#, &["EXE", r"a\\\b"]),
        (r#"EXE "\"Call Me Ishmael\"""#, &["EXE", r#""Call Me Ishmael""#]),
        (r#"EXE "C:\TEST A\\""#, &["EXE", r"C:\TEST A\"]),
        (r#"EXE "\"C:\TEST A\\\"""#, &["EXE", r#""C:\TEST A\""#]),
        (r#"EXE "a b c" d e"#, &["EXE", "a b c", "d", "e"]),
        (r#"EXE "ab\"c" "\\" d"#, &["EXE", r#"ab"c"#, r"\", "d"]),
        (r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]),
        (r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]),
        (r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]),
        // Double Double Quotes
        (r#"EXE "a b c"""#, &["EXE", r#"a b c""#]),
        (r#"EXE """CallMeIshmael""" b c"#, &["EXE", r#""CallMeIshmael""#, "b", "c"]),
        (r#"EXE """Call Me Ishmael""""#, &["EXE", r#""Call Me Ishmael""#]),
        (r#"EXE """"Call Me Ishmael"" b c"#, &["EXE", r#""Call"#, "Me", "Ishmael", "b", "c"]),
    ];

    for &(cmd_line, expected_args) in CASES {
        chk(cmd_line, expected_args);
    }
}

View File

@ -789,7 +789,7 @@ extern "system" {
pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL; pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL;
pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL; pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL;
pub fn SetLastError(dwErrCode: DWORD); pub fn SetLastError(dwErrCode: DWORD);
pub fn GetCommandLineW() -> *mut LPCWSTR; pub fn GetCommandLineW() -> LPWSTR;
pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD; pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD;
pub fn GetCurrentProcess() -> HANDLE; pub fn GetCurrentProcess() -> HANDLE;
pub fn GetCurrentThread() -> HANDLE; pub fn GetCurrentThread() -> HANDLE;