Auto merge of #87580 - ChrisDenton:win-arg-parse-2008, r=m-ou-se

Update Windows Argument Parsing

Fixes #44650

The Windows command line is passed to applications [as a single string](https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string) which the application then parses to get a list of arguments. The standard rules (as used by C/C++) for parsing the command line have slightly changed over the years, most recently in 2008 which added new escaping rules.

This PR implements the new rules as [described on MSDN](https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments) and [further detailed here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN). It has been tested against the behaviour of C++ by calling a C++ program that outputs its raw command line and the contents of `argv`. See [my repo](https://github.com/ChrisDenton/winarg/tree/std) if anyone wants to reproduce my work.

For an overview of how this PR changes argument parsing behavior and why we feel it is warranted see https://github.com/rust-lang/rust/pull/87580#issuecomment-893833893.

For some examples see: https://github.com/rust-lang/rust/pull/87580#issuecomment-894299249
This commit is contained in:
bors 2021-09-02 16:16:13 +00:00
commit 1cf8fdd4f0
4 changed files with 208 additions and 130 deletions

View File

@ -253,6 +253,7 @@
#![feature(const_ip)]
#![feature(const_ipv4)]
#![feature(const_ipv6)]
#![feature(const_option)]
#![feature(const_raw_ptr_deref)]
#![feature(const_socketaddr)]
#![feature(const_trait_impl)]

View File

@ -1,13 +1,18 @@
#![allow(dead_code)] // runtime init functions not used during testing
//! The Windows command line is just a string
//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
//!
//! This module implements the parsing necessary to turn that string into a list of arguments.
#[cfg(test)]
mod tests;
use crate::ffi::OsString;
use crate::fmt;
use crate::marker::PhantomData;
use crate::num::NonZeroU16;
use crate::os::windows::prelude::*;
use crate::path::PathBuf;
use crate::slice;
use crate::ptr::NonNull;
use crate::sys::c;
use crate::sys::windows::os::current_exe;
use crate::vec;
@ -15,9 +20,11 @@ use crate::vec;
use core::iter;
pub fn args() -> Args {
// SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
// string so it's safe for `WStrUnits` to use.
unsafe {
let lp_cmd_line = c::GetCommandLineW();
let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || {
let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
});
@ -28,129 +35,120 @@ pub fn args() -> Args {
/// Implements the Windows command-line argument parsing algorithm.
///
/// Microsoft's documentation for the Windows CLI argument format can be found at
/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
///
/// Windows includes a function to do this in shell32.dll,
/// but linking with that DLL causes the process to be registered as a GUI application.
/// A more in-depth explanation is here:
/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
///
/// Windows includes a function to do command line parsing in shell32.dll.
/// However, this is not used for two reasons:
///
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
///
/// This function was tested for equivalence to the shell32.dll implementation in
/// Windows 10 Pro v1803, using an exhaustive test suite available at
/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(
lp_cmd_line: *const u16,
/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
///
/// This function was tested for equivalence to the C/C++ parsing rules using an
/// extensive test suite available at
/// <https://github.com/ChrisDenton/winarg/tree/std>.
fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
lp_cmd_line: Option<WStrUnits<'a>>,
exe_name: F,
) -> Vec<OsString> {
const BACKSLASH: u16 = '\\' as u16;
const QUOTE: u16 = '"' as u16;
const TAB: u16 = '\t' as u16;
const SPACE: u16 = ' ' as u16;
const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap();
const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap();
const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap();
const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap();
let mut ret_val = Vec::new();
if lp_cmd_line.is_null() || *lp_cmd_line == 0 {
// If the cmd line pointer is null or it points to an empty string then
// return the name of the executable as argv[0].
if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
ret_val.push(exe_name());
return ret_val;
}
let mut cmd_line = {
let mut end = 0;
while *lp_cmd_line.offset(end) != 0 {
end += 1;
}
slice::from_raw_parts(lp_cmd_line, end as usize)
};
let mut code_units = lp_cmd_line.unwrap();
// The executable name at the beginning is special.
cmd_line = match cmd_line[0] {
// The executable name ends at the next quote mark,
// no matter what.
QUOTE => {
let args = {
let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE);
if let Some(exe) = cut.next() {
ret_val.push(OsString::from_wide(exe));
}
cut.next()
};
if let Some(args) = args {
args
} else {
return ret_val;
}
}
// Implement quirk: when they say whitespace here,
// they include the entire ASCII control plane:
// "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
// will consider the first argument to be an empty string. Excess whitespace at the
// end of lpCmdLine is ignored."
0..=SPACE => {
ret_val.push(OsString::new());
&cmd_line[1..]
}
// The executable name ends at the next whitespace,
// no matter what.
_ => {
let args = {
let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE);
if let Some(exe) = cut.next() {
ret_val.push(OsString::from_wide(exe));
}
cut.next()
};
if let Some(args) = args {
args
} else {
return ret_val;
}
}
};
let mut cur = Vec::new();
let mut in_quotes = false;
let mut was_in_quotes = false;
let mut backslash_count: usize = 0;
for &c in cmd_line {
match c {
// backslash
BACKSLASH => {
backslash_count += 1;
was_in_quotes = false;
}
QUOTE if backslash_count % 2 == 0 => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
backslash_count = 0;
if was_in_quotes {
cur.push('"' as u16);
was_in_quotes = false;
} else {
was_in_quotes = in_quotes;
in_quotes = !in_quotes;
}
}
QUOTE if backslash_count % 2 != 0 => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
backslash_count = 0;
was_in_quotes = false;
cur.push(b'"' as u16);
}
SPACE | TAB if !in_quotes => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
if !cur.is_empty() || was_in_quotes {
ret_val.push(OsString::from_wide(&cur[..]));
cur.truncate(0);
}
backslash_count = 0;
was_in_quotes = false;
}
_ => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
backslash_count = 0;
was_in_quotes = false;
cur.push(c);
}
let mut cur = Vec::new();
for w in &mut code_units {
match w {
// A quote mark always toggles `in_quotes` no matter what because
// there are no escape characters when parsing the executable name.
QUOTE => in_quotes = !in_quotes,
// If not `in_quotes` then whitespace ends argv[0].
SPACE | TAB if !in_quotes => break,
// In all other cases the code unit is taken literally.
_ => cur.push(w.get()),
}
}
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
// include empty quoted strings at the end of the arguments list
if !cur.is_empty() || was_in_quotes || in_quotes {
// Skip whitespace.
code_units.advance_while(|w| w == SPACE || w == TAB);
ret_val.push(OsString::from_wide(&cur));
// Parse the arguments according to these rules:
// * All code units are taken literally except space, tab, quote and backslash.
// * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
// treated as a single separator.
// * A space or tab `in_quotes` is taken literally.
// * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
// * A quote can be escaped if preceded by an odd number of backslashes.
// * If any number of backslashes is immediately followed by a quote then the number of
// backslashes is halved (rounding down).
// * Backslashes not followed by a quote are all taken literally.
// * If `in_quotes` then a quote can also be escaped using another quote
// (i.e. two consecutive quotes become one literal quote).
let mut cur = Vec::new();
let mut in_quotes = false;
while let Some(w) = code_units.next() {
match w {
// If not `in_quotes`, a space or tab ends the argument.
SPACE | TAB if !in_quotes => {
ret_val.push(OsString::from_wide(&cur[..]));
cur.truncate(0);
// Skip whitespace.
code_units.advance_while(|w| w == SPACE || w == TAB);
}
// Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
BACKSLASH => {
let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
if code_units.peek() == Some(QUOTE) {
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
// The quote is escaped if there are an odd number of backslashes.
if backslash_count % 2 == 1 {
code_units.next();
cur.push(QUOTE.get());
}
} else {
// If there is no quote on the end then there is no escaping.
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
}
}
// If `in_quotes` and not backslash escaped (see above) then a quote either
// unsets `in_quote` or is escaped by another quote.
QUOTE if in_quotes => match code_units.peek() {
// Two consecutive quotes when `in_quotes` produces one literal quote.
Some(QUOTE) => {
cur.push(QUOTE.get());
code_units.next();
}
// Otherwise set `in_quotes`.
Some(_) => in_quotes = false,
// The end of the command line.
// Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
None => break,
},
// If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
QUOTE => in_quotes = true,
// Everything else is always taken literally.
_ => cur.push(w.get()),
}
}
// Push the final argument, if any.
if !cur.is_empty() || in_quotes {
ret_val.push(OsString::from_wide(&cur[..]));
}
ret_val
@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
self.parsed_args_list.len()
}
}
/// A safe iterator over a LPWSTR
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
struct WStrUnits<'a> {
// The pointer must never be null...
lpwstr: NonNull<u16>,
// ...and the memory it points to must be valid for this lifetime.
lifetime: PhantomData<&'a [u16]>,
}
impl WStrUnits<'_> {
/// Create the iterator. Returns `None` if `lpwstr` is null.
///
/// SAFETY: `lpwstr` must point to a null-terminated wide string that lives
/// at least as long as the lifetime of this struct.
unsafe fn new(lpwstr: *const u16) -> Option<Self> {
Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData })
}
fn peek(&self) -> Option<NonZeroU16> {
// SAFETY: It's always safe to read the current item because we don't
// ever move out of the array's bounds.
unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
}
/// Advance the iterator while `predicate` returns true.
/// Returns the number of items it advanced by.
fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
let mut counter = 0;
while let Some(w) = self.peek() {
if !predicate(w) {
break;
}
counter += 1;
self.next();
}
counter
}
}
impl Iterator for WStrUnits<'_> {
// This can never return zero as that marks the end of the string.
type Item = NonZeroU16;
fn next(&mut self) -> Option<NonZeroU16> {
// SAFETY: If NULL is reached we immediately return.
// Therefore it's safe to advance the pointer after that.
unsafe {
let next = self.peek()?;
self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1));
Some(next)
}
}
}

View File

@ -5,9 +5,9 @@ fn chk(string: &str, parts: &[&str]) {
let mut wide: Vec<u16> = OsString::from(string).encode_wide().collect();
wide.push(0);
let parsed =
unsafe { parse_lp_cmd_line(wide.as_ptr() as *const u16, || OsString::from("TEST.EXE")) };
unsafe { parse_lp_cmd_line(WStrUnits::new(wide.as_ptr()), || OsString::from("TEST.EXE")) };
let expected: Vec<OsString> = parts.iter().map(|k| OsString::from(k)).collect();
assert_eq!(parsed.as_slice(), expected.as_slice());
assert_eq!(parsed.as_slice(), expected.as_slice(), "{:?}", string);
}
#[test]
@ -27,35 +27,65 @@ fn single_words() {
#[test]
fn official_examples() {
chk(r#"EXE "abc" d e"#, &["EXE", "abc", "d", "e"]);
chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r#"a\\\b"#, "de fg", "h"]);
chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]);
chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]);
chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r#"a\\b c"#, "d", "e"]);
chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]);
}
#[test]
fn whitespace_behavior() {
chk(r#" test"#, &["", "test"]);
chk(r#" test"#, &["", "test"]);
chk(r#" test test2"#, &["", "test", "test2"]);
chk(r#" test test2"#, &["", "test", "test2"]);
chk(r#"test test2 "#, &["test", "test2"]);
chk(r#"test test2 "#, &["test", "test2"]);
chk(r#"test "#, &["test"]);
chk(" test", &["", "test"]);
chk(" test", &["", "test"]);
chk(" test test2", &["", "test", "test2"]);
chk(" test test2", &["", "test", "test2"]);
chk("test test2 ", &["test", "test2"]);
chk("test test2 ", &["test", "test2"]);
chk("test ", &["test"]);
}
#[test]
fn genius_quotes() {
chk(r#"EXE "" """#, &["EXE", "", ""]);
chk(r#"EXE "" """"#, &["EXE", "", "\""]);
chk(r#"EXE "" """"#, &["EXE", "", r#"""#]);
chk(
r#"EXE "this is """all""" in the same argument""#,
&["EXE", "this is \"all\" in the same argument"],
&["EXE", r#"this is "all" in the same argument"#],
);
chk(r#"EXE "a"""#, &["EXE", "a\""]);
chk(r#"EXE "a"" a"#, &["EXE", "a\"", "a"]);
chk(r#"EXE "a"""#, &["EXE", r#"a""#]);
chk(r#"EXE "a"" a"#, &["EXE", r#"a" a"#]);
// quotes cannot be escaped in command names
chk(r#""EXE" check"#, &["EXE", "check"]);
chk(r#""EXE check""#, &["EXE check"]);
chk(r#""EXE """for""" check"#, &["EXE ", r#"for""#, "check"]);
chk(r#""EXE \"for\" check"#, &[r#"EXE \"#, r#"for""#, "check"]);
chk(r#""EXE """for""" check"#, &["EXE for check"]);
chk(r#""EXE \"for\" check"#, &[r"EXE \for\ check"]);
chk(r#""EXE \" for \" check"#, &[r"EXE \", "for", r#"""#, "check"]);
chk(r#"E"X"E test"#, &["EXE", "test"]);
chk(r#"EX""E test"#, &["EXE", "test"]);
}
// from https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX
#[test]
fn post_2008() {
chk("EXE CallMeIshmael", &["EXE", "CallMeIshmael"]);
chk(r#"EXE "Call Me Ishmael""#, &["EXE", "Call Me Ishmael"]);
chk(r#"EXE Cal"l Me I"shmael"#, &["EXE", "Call Me Ishmael"]);
chk(r#"EXE CallMe\"Ishmael"#, &["EXE", r#"CallMe"Ishmael"#]);
chk(r#"EXE "CallMe\"Ishmael""#, &["EXE", r#"CallMe"Ishmael"#]);
chk(r#"EXE "Call Me Ishmael\\""#, &["EXE", r"Call Me Ishmael\"]);
chk(r#"EXE "CallMe\\\"Ishmael""#, &["EXE", r#"CallMe\"Ishmael"#]);
chk(r#"EXE a\\\b"#, &["EXE", r"a\\\b"]);
chk(r#"EXE "a\\\b""#, &["EXE", r"a\\\b"]);
chk(r#"EXE "\"Call Me Ishmael\"""#, &["EXE", r#""Call Me Ishmael""#]);
chk(r#"EXE "C:\TEST A\\""#, &["EXE", r"C:\TEST A\"]);
chk(r#"EXE "\"C:\TEST A\\\"""#, &["EXE", r#""C:\TEST A\""#]);
chk(r#"EXE "a b c" d e"#, &["EXE", "a b c", "d", "e"]);
chk(r#"EXE "ab\"c" "\\" d"#, &["EXE", r#"ab"c"#, r"\", "d"]);
chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]);
chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]);
chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]);
// Double Double Quotes
chk(r#"EXE "a b c"""#, &["EXE", r#"a b c""#]);
chk(r#"EXE """CallMeIshmael""" b c"#, &["EXE", r#""CallMeIshmael""#, "b", "c"]);
chk(r#"EXE """Call Me Ishmael""""#, &["EXE", r#""Call Me Ishmael""#]);
chk(r#"EXE """"Call Me Ishmael"" b c"#, &["EXE", r#""Call"#, "Me", "Ishmael", "b", "c"]);
}

View File

@ -789,7 +789,7 @@ extern "system" {
pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL;
pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL;
pub fn SetLastError(dwErrCode: DWORD);
pub fn GetCommandLineW() -> *mut LPCWSTR;
pub fn GetCommandLineW() -> LPWSTR;
pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD;
pub fn GetCurrentProcess() -> HANDLE;
pub fn GetCurrentThread() -> HANDLE;