-
-
Notifications
You must be signed in to change notification settings - Fork 788
Add mixed line ending hook #218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
56d4543
3d4fb41
16b7c7a
afaa97c
5186664
93194b9
b2b0d59
ad0062a
466f9e1
aaf134c
0a8b929
2b28f4f
22b2282
f477582
b1294b8
4270b56
a1ffbfa
c6c4c4a
614893f
a1e1421
609d011
63bb1fd
3dbeeee
d0016c5
ba63d1b
2b6ad97
d16d04a
8bc4af4
4fc9624
1937788
560e1c2
0335ebf
55658c4
ab2a849
41ff0e1
eb0c3ba
f795097
f58b552
4be276c
ef4a323
4d3d8e1
f9915cb
0e223bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,212 @@ | ||
| import argparse | ||
| import re | ||
| import sys | ||
|
|
||
| from enum import Enum | ||
|
|
||
|
|
||
class LineEnding(Enum):
    """Line terminators the hook knows about.

    Each member's value is a ``(string, opt_name, regex)`` tuple that
    ``__init__`` unpacks into attributes:

    * ``string``    -- the terminator as bytes.
    * ``str_print`` -- ``repr()`` of ``string``, used in printed messages.
    * ``opt_name``  -- short lowercase name of the terminator.
    * ``regex``     -- compiled bytes pattern matching exactly this
      terminator (a lone CR or lone LF is not confused with CRLF thanks to
      the lookahead / lookbehind).
    """

    CR = b'\r', 'cr', re.compile(b'\r(?!\n)', re.DOTALL)
    CRLF = b'\r\n', 'crlf', re.compile(b'\r\n', re.DOTALL)
    LF = b'\n', 'lf', re.compile(b'(?<!\r)\n', re.DOTALL)

    def __init__(self, string, opt_name, regex):
        # Enum passes the elements of the member's tuple value positionally.
        self.string, self.opt_name, self.regex = string, opt_name, regex
        self.str_print = repr(string)
|
|
||
|
|
||
class MixedLineEndingOption(Enum):
    """Values accepted by the ``--fix`` command-line option.

    Each member's value is an ``(opt_name, line_ending_enum)`` pair:

    * ``opt_name``         -- the string typed on the command line.
    * ``line_ending_enum`` -- the ``LineEnding`` to force, or ``None`` for
      the members (``AUTO``, ``NO``) that do not name a line ending.
    """

    AUTO = 'auto', None
    NO = 'no', None
    CRLF = LineEnding.CRLF.opt_name, LineEnding.CRLF
    LF = LineEnding.LF.opt_name, LineEnding.LF

    def __init__(self, opt_name, line_ending_enum):
        # Enum passes the elements of the member's tuple value positionally.
        self.line_ending_enum = line_ending_enum
        self.opt_name = opt_name
|
|
||
|
|
||
class MixedLineDetection(Enum):
    """Outcome of scanning one file for mixed line endings.

    Each member's value is an ``(index, mle_found, line_ending_enum)`` tuple:

    * ``index``            -- unique number per member (see ``__init__``).
    * ``mle_found``        -- ``True`` when mixed line endings were found
      and a most frequent one could be named.
    * ``line_ending_enum`` -- the most frequent ``LineEnding``, or ``None``.
    """

    NOT_MIXED = 1, False, None
    UNKNOWN = 2, False, None
    MIXED_MOSTLY_CRLF = 3, True, LineEnding.CRLF
    MIXED_MOSTLY_LF = 4, True, LineEnding.LF
    MIXED_MOSTLY_CR = 5, True, LineEnding.CR

    def __init__(self, index, mle_found, line_ending_enum):
        # TODO: `index` is a hack. Without it NOT_MIXED and UNKNOWN would
        # have the same value (False, None) and Enum would turn the second
        # one into an alias of the first.
        self.index = index
        self.mle_found = mle_found
        self.line_ending_enum = line_ending_enum
|
|
||
|
|
||
# Single capturing group that matches any one line terminator: the
# alternation of the CRLF, LF and CR patterns of LineEnding, in that order.
ANY_LINE_ENDING_PATTERN = re.compile(
    b'(' + b'|'.join(
        ending.regex.pattern
        for ending in (LineEnding.CRLF, LineEnding.LF, LineEnding.CR)
    ) + b')',
)
|
|
||
|
|
||
def mixed_line_ending(argv=None):
    """Entry point of the hook: parse *argv* and run the selected mode.

    Args:
        argv: argument list handed to the argument parser; ``None`` lets
            argparse fall back to the process arguments.

    Returns:
        The integer status returned by the selected ``_process_*`` function.
    """
    parsed = _parse_arguments(argv)
    paths = parsed['filenames']
    mode = parsed['fix']

    if mode == MixedLineEndingOption.NO:
        return _process_no_fix(paths)
    if mode == MixedLineEndingOption.AUTO:
        return _process_fix_auto(paths)
    # Any remaining option names the line ending to force with --fix.
    return _process_fix_force(paths, mode.line_ending_enum)
|
|
||
|
|
||
def _parse_arguments(argv=None):
    """Parse the command line of the hook.

    Args:
        argv: argument list to parse; ``None`` lets argparse use the
            process arguments.

    Returns:
        A dict with two keys: ``'fix'`` (the ``MixedLineEndingOption``
        member selected by ``--fix``) and ``'filenames'`` (the positional
        file names).
    """
    fix_choices = [member.opt_name for member in MixedLineEndingOption]

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f',
        '--fix',
        choices=fix_choices,
        default=MixedLineEndingOption.AUTO.opt_name,
        help='Replace line ending with the specified. Default is "auto"',
    )
    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
    namespace = parser.parse_args(argv)

    # Map the option string back to its enum member. The one-element
    # unpacking insists on exactly one match.
    [fix] = [
        member for member in MixedLineEndingOption.__members__.values()
        if member.opt_name == namespace.fix
    ]

    return {'fix': fix, 'filenames': namespace.filenames}
|
|
||
|
|
||
def _detect_line_ending(filename):
    """Classify the line endings used in *filename*.

    Args:
        filename: path of the file to scan; it is read in binary mode.

    Returns:
        ``MixedLineDetection.NOT_MIXED`` when fewer than two kinds of line
        ending occur; the ``MIXED_MOSTLY_*`` member for the line ending
        that is strictly the most frequent when several kinds occur; and
        ``MixedLineDetection.UNKNOWN`` when several kinds occur but the
        highest count is shared.
    """
    # The whole file is read at once and the file object is not needed
    # afterwards. Review rationale: files checked into git are expected to
    # be of a manageable size.
    with open(filename, 'rb') as stream:
        contents = stream.read()

    counts = {
        ending: len(ending.regex.findall(contents)) for ending in LineEnding
    }

    # "Mixed" means at least two different kinds of terminator are present.
    kinds_present = sum(1 for count in counts.values() if count > 0)
    if kinds_present < 2:
        return MixedLineDetection.NOT_MIXED

    highest = max(counts.values())
    leaders = [ending for ending, count in counts.items() if count == highest]
    if len(leaders) != 1:
        # Tie for the most frequent terminator: no majority can be named.
        return MixedLineDetection.UNKNOWN

    dominant = leaders[0]
    for outcome in MixedLineDetection:
        # Members carrying None never compare equal to a LineEnding.
        if outcome.line_ending_enum == dominant:
            return outcome

    return MixedLineDetection.UNKNOWN
|
|
||
|
|
||
def _process_no_fix(filenames):
    """Report the files that have mixed line endings without changing them.

    Args:
        filenames: paths of the files to check.

    Returns:
        1 if at least one file has mixed line endings, otherwise 0.
    """
    print('Checking if the files have mixed line ending.')

    mle_filenames = [
        filename for filename in filenames
        if _detect_line_ending(filename).mle_found
    ]

    if not mle_filenames:
        return 0

    # Interpolate explicitly: print() does no %-formatting of its own, so
    # passing the names as a second argument printed a literal '%s'.
    message = (
        'The following files have mixed line endings:\n\t%s'
        % ('\n\t'.join(mle_filenames),)
    )
    print(message)

    return 1
|
|
||
|
|
||
def _process_fix_auto(filenames):
    """Convert each mixed file to its most frequent line ending.

    Files without mixed line endings are left alone. Files that are mixed
    but have no single most frequent line ending are reported and skipped.

    Args:
        filenames: paths of the files to check and, if needed, rewrite.

    Returns:
        1 if any file was detected as mixed (converted or skipped),
        otherwise 0.
    """
    mle_found = False

    for filename in filenames:
        detect_result = _detect_line_ending(filename)

        if detect_result == MixedLineDetection.NOT_MIXED:
            # print() does no %-formatting itself; interpolate explicitly.
            print('The file %s has no mixed line ending' % (filename,))
            continue

        # Every remaining outcome means several kinds of ending were seen.
        mle_found = True

        if detect_result == MixedLineDetection.UNKNOWN:
            message = (
                'Could not define most frequent line ending in '
                'file %s. File skiped.'
            ) % (filename,)
            print(message)
            continue

        le_enum = detect_result.line_ending_enum

        message = (
            'The file %s has mixed line ending with a '
            'majority of %s. Converting...'
        ) % (filename, le_enum.str_print)
        print(message)

        _convert_line_ending(filename, le_enum.string)

        message = (
            'The file %s has been converted to %s line ending.'
            % (filename, le_enum.str_print)
        )
        print(message)

    return 1 if mle_found else 0
|
|
||
|
|
||
def _process_fix_force(filenames, line_ending_enum):
    """Rewrite every file so that it only uses the given line ending.

    Args:
        filenames: paths of the files to rewrite.
        line_ending_enum: the ``LineEnding`` member to force.

    Returns:
        Always 1.
    """
    for filename in filenames:
        _convert_line_ending(filename, line_ending_enum.string)

        # Interpolate explicitly: print() does no %-formatting of its own,
        # so the extra arguments used to be printed after a literal '%s'.
        message = (
            'The file %s has been forced to %s line ending.'
            % (filename, line_ending_enum.str_print)
        )
        print(message)

    # NOTE(review): 1 is returned even when `filenames` is empty or no file
    # actually changed -- confirm this is the intended status for callers.
    return 1
|
|
||
|
|
||
def _convert_line_ending(filename, line_ending):
    """Rewrite *filename* in place using *line_ending* for every line end.

    Args:
        filename: path of the file to rewrite; opened in binary read/write
            mode.
        line_ending: bytes substituted for each match of
            ``ANY_LINE_ENDING_PATTERN``.
    """
    with open(filename, 'rb+') as handle:
        original = handle.read()
        converted = ANY_LINE_ENDING_PATTERN.sub(line_ending, original)

        # Overwrite from the start of the file, then drop whatever is left
        # of the old content beyond the newly written bytes.
        handle.seek(0)
        handle.write(converted)
        handle.truncate()
|
|
||
|
|
||
if __name__ == '__main__':
    # Use the status returned by the hook as the process exit status.
    exit_status = mixed_line_ending()
    sys.exit(exit_status)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| This line ends with 'LF' | ||
| This line ends with 'CRLF' | ||
| This line ends with 'LF' | ||
| This line ends with 'CRLF' | ||
| This line ends with 'LF' | ||
| This line ends with 'CRLF' | ||
| This line ends with 'LF' | ||
| This line ends with 'CRLF' | ||
| This line ends with 'LF' | ||
| This line ends with 'CRLF' | ||
| This line ends with 'LF' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, we could remove the equality hack if we do something simple like "`lf` wins on ties" or something. Just an idea.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, isn't it platform-related? I mean, Windows users would not appreciate having their files changed to `lf` line endings.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if they're 50/50 I think it's probably fine?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, `mixed_line_ending.py` is able to detect `LF`, `CRLF`, and `CR`. So it would be 33% each.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that makes it even more rare, I'd say just pick one of them if there's ties (since it also simplifies other code elsewhere iirc)
Up to you though, this is fine as is :)